diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index f394eb05a53c90208ce8f6e23e700b5a3401949d..5e7fd1a48f01d7c553b0a12d2ef87eb36db5c913 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6859,6 +6859,13 @@ Note that KVM does not skip the faulting instruction as it does for KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state if it decides to decode and emulate the instruction. +This feature isn't available to protected VMs, as userspace does not +have access to the state that is required to perform the emulation. +Instead, a data abort exception is directly injected in the guest. +Note that although KVM_CAP_ARM_NISV_TO_USER will be reported if +queried outside of a protected VM context, the feature will not be +exposed if queried on a protected VM file descriptor. + :: /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ diff --git a/Documentation/virt/kvm/arm/fw-pseudo-registers.rst b/Documentation/virt/kvm/arm/fw-pseudo-registers.rst new file mode 100644 index 0000000000000000000000000000000000000000..b90fd0b0fa666cb82ca91b06b1fb65dd9b8121de --- /dev/null +++ b/Documentation/virt/kvm/arm/fw-pseudo-registers.rst @@ -0,0 +1,138 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +ARM firmware pseudo-registers interface +======================================= + +KVM handles the hypercall services as requested by the guests. New hypercall +services are regularly made available by the ARM specification or by KVM (as +vendor services) if they make sense from a virtualization point of view. + +This means that a guest booted on two different versions of KVM can observe +two different "firmware" revisions. This could cause issues if a given guest +is tied to a particular version of a hypercall service, or if a migration +causes a different version to be exposed out of the blue to an unsuspecting +guest. + +In order to remedy this situation, KVM exposes a set of "firmware +pseudo-registers" that can be manipulated using the GET/SET_ONE_REG +interface. These registers can be saved/restored by userspace, and set +to a convenient value as required. + +The following registers are defined: + +* KVM_REG_ARM_PSCI_VERSION: + + KVM implements the PSCI (Power State Coordination Interface) + specification in order to provide services such as CPU on/off, reset + and power-off to the guest. + + - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set + (and thus has already been initialized) + - Returns the current PSCI version on GET_ONE_REG (defaulting to the + highest PSCI version implemented by KVM and compatible with v0.2) + - Allows any PSCI version implemented by KVM and compatible with + v0.2 to be set with SET_ONE_REG + - Affects the whole VM (even if the register view is per-vcpu) + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + Holds the state of the firmware support to mitigate CVE-2017-5715, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_1 in [1]. + + Accepted values are: + + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: + KVM does not offer + firmware support for the workaround. The mitigation status for the + guest is unknown. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: + The workaround HVC call is + available to the guest and required for the mitigation. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: + The workaround HVC call + is available to the guest, but it is not needed on this VCPU. + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + Holds the state of the firmware support to mitigate CVE-2018-3639, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_2 in [1]_. + + Accepted values are: + + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: + A workaround is not + available. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: + The workaround state is + unknown. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: + The workaround is available, + and can be disabled by a vCPU. If + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for + this vCPU. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: + The workaround is always active on this vCPU or it is not needed. + + +Bitmap Feature Firmware Registers +--------------------------------- + +Contrary to the above registers, the following registers exposes the +hypercall services in the form of a feature-bitmap to the userspace. This +bitmap is translated to the services that are available to the guest. +There is a register defined per service call owner and can be accessed via +GET/SET_ONE_REG interface. + +By default, these registers are set with the upper limit of the features +that are supported. This way userspace can discover all the usable +hypercall services via GET_ONE_REG. The user-space can write-back the +desired bitmap back via SET_ONE_REG. The features for the registers that +are untouched, probably because userspace isn't aware of them, will be +exposed as is to the guest. + +Note that KVM will not allow the userspace to configure the registers +anymore once any of the vCPUs has run at least once. Instead, it will +return a -EBUSY. + +The pseudo-firmware bitmap register are as follows: + +* KVM_REG_ARM_STD_BMAP: + Controls the bitmap of the ARM Standard Secure Service Calls. + + The following bits are accepted: + + Bit-0: KVM_REG_ARM_STD_BIT_TRNG_V1_0: + The bit represents the services offered under v1.0 of ARM True Random + Number Generator (TRNG) specification, ARM DEN0098. + +* KVM_REG_ARM_STD_HYP_BMAP: + Controls the bitmap of the ARM Standard Hypervisor Service Calls. + + The following bits are accepted: + + Bit-0: KVM_REG_ARM_STD_HYP_BIT_PV_TIME: + The bit represents the Paravirtualized Time service as represented by + ARM DEN0057A. + +* KVM_REG_ARM_VENDOR_HYP_BMAP: + Controls the bitmap of the Vendor specific Hypervisor Service Calls. + + The following bits are accepted: + + Bit-0: KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT + The bit represents the ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID + and ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID function-ids. + + Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_PTP: + The bit represents the Precision Time Protocol KVM service. + +Errors: + + ======= ============================================================= + -ENOENT Unknown register accessed. + -EBUSY Attempt a 'write' to the register after the VM has started. + -EINVAL Invalid bitmap written to the register. + ======= ============================================================= + +.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst index 3e23084644ba2aa680f1d2b232bc500f473254e2..17be111f493f56ca2e0b0d8ee80a269dd863242c 100644 --- a/Documentation/virt/kvm/arm/hypercalls.rst +++ b/Documentation/virt/kvm/arm/hypercalls.rst @@ -1,138 +1,46 @@ .. SPDX-License-Identifier: GPL-2.0 -======================= -ARM Hypercall Interface -======================= - -KVM handles the hypercall services as requested by the guests. New hypercall -services are regularly made available by the ARM specification or by KVM (as -vendor services) if they make sense from a virtualization point of view. - -This means that a guest booted on two different versions of KVM can observe -two different "firmware" revisions. This could cause issues if a given guest -is tied to a particular version of a hypercall service, or if a migration -causes a different version to be exposed out of the blue to an unsuspecting -guest. - -In order to remedy this situation, KVM exposes a set of "firmware -pseudo-registers" that can be manipulated using the GET/SET_ONE_REG -interface. These registers can be saved/restored by userspace, and set -to a convenient value as required. - -The following registers are defined: - -* KVM_REG_ARM_PSCI_VERSION: - - KVM implements the PSCI (Power State Coordination Interface) - specification in order to provide services such as CPU on/off, reset - and power-off to the guest. - - - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set - (and thus has already been initialized) - - Returns the current PSCI version on GET_ONE_REG (defaulting to the - highest PSCI version implemented by KVM and compatible with v0.2) - - Allows any PSCI version implemented by KVM and compatible with - v0.2 to be set with SET_ONE_REG - - Affects the whole VM (even if the register view is per-vcpu) - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - Holds the state of the firmware support to mitigate CVE-2017-5715, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_1 in [1]. - - Accepted values are: - - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: - KVM does not offer - firmware support for the workaround. The mitigation status for the - guest is unknown. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: - The workaround HVC call is - available to the guest and required for the mitigation. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: - The workaround HVC call - is available to the guest, but it is not needed on this VCPU. - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - Holds the state of the firmware support to mitigate CVE-2018-3639, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_2 in [1]_. - - Accepted values are: - - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: - A workaround is not - available. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: - The workaround state is - unknown. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: - The workaround is available, - and can be disabled by a vCPU. If - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for - this vCPU. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: - The workaround is always active on this vCPU or it is not needed. - - -Bitmap Feature Firmware Registers ---------------------------------- - -Contrary to the above registers, the following registers exposes the -hypercall services in the form of a feature-bitmap to the userspace. This -bitmap is translated to the services that are available to the guest. -There is a register defined per service call owner and can be accessed via -GET/SET_ONE_REG interface. - -By default, these registers are set with the upper limit of the features -that are supported. This way userspace can discover all the usable -hypercall services via GET_ONE_REG. The user-space can write-back the -desired bitmap back via SET_ONE_REG. The features for the registers that -are untouched, probably because userspace isn't aware of them, will be -exposed as is to the guest. - -Note that KVM will not allow the userspace to configure the registers -anymore once any of the vCPUs has run at least once. Instead, it will -return a -EBUSY. - -The pseudo-firmware bitmap register are as follows: - -* KVM_REG_ARM_STD_BMAP: - Controls the bitmap of the ARM Standard Secure Service Calls. - - The following bits are accepted: - - Bit-0: KVM_REG_ARM_STD_BIT_TRNG_V1_0: - The bit represents the services offered under v1.0 of ARM True Random - Number Generator (TRNG) specification, ARM DEN0098. - -* KVM_REG_ARM_STD_HYP_BMAP: - Controls the bitmap of the ARM Standard Hypervisor Service Calls. - - The following bits are accepted: - - Bit-0: KVM_REG_ARM_STD_HYP_BIT_PV_TIME: - The bit represents the Paravirtualized Time service as represented by - ARM DEN0057A. - -* KVM_REG_ARM_VENDOR_HYP_BMAP: - Controls the bitmap of the Vendor specific Hypervisor Service Calls. - - The following bits are accepted: - - Bit-0: KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT - The bit represents the ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID - and ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID function-ids. - - Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_PTP: - The bit represents the Precision Time Protocol KVM service. - -Errors: - - ======= ============================================================= - -ENOENT Unknown register accessed. - -EBUSY Attempt a 'write' to the register after the VM has started. - -EINVAL Invalid bitmap written to the register. - ======= ============================================================= - -.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf +=============================================== +KVM/arm64-specific hypercalls exposed to guests +=============================================== + +This file documents the KVM/arm64-specific hypercalls which may be +exposed by KVM/arm64 to guest operating systems. These hypercalls are +issued using the HVC instruction according to version 1.1 of the Arm SMC +Calling Convention (DEN0028/C): + +https://developer.arm.com/docs/den0028/c + +All KVM/arm64-specific hypercalls are allocated within the "Vendor +Specific Hypervisor Service Call" range with a UID of +``28b46fb6-2ec5-11e9-a9ca-4b564d003a74``. This UID should be queried by the +guest using the standard "Call UID" function for the service range in +order to determine that the KVM/arm64-specific hypercalls are available. + +``ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID`` +--------------------------------------------- + +Provides a discovery mechanism for other KVM/arm64 hypercalls. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Mandatory for the KVM/arm64 UID | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC32 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0x86000000 | ++---------------------+----------+--------------------------------------------------+ +| Arguments: | None | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (uint32) | R0 | Bitmap of available function numbers 0-31 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R1 | Bitmap of available function numbers 32-63 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R2 | Bitmap of available function numbers 64-95 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R3 | Bitmap of available function numbers 96-127 | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID`` +---------------------------------------- + +See ptp_kvm.rst diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 7f231c724e16756f63b2a38e9e2df6a60727d2ee..ec09881de4cf6dce07da909d174f45baf460ca46 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -7,6 +7,7 @@ ARM .. toctree:: :maxdepth: 2 + fw-pseudo-registers hyp-abi hypercalls pvtime diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst index aecdc80ddcd857d7a688f7660e7b754f06d88446..7c0960970a0ee536634ecdbc9723dceee334ca98 100644 --- a/Documentation/virt/kvm/arm/ptp_kvm.rst +++ b/Documentation/virt/kvm/arm/ptp_kvm.rst @@ -7,19 +7,29 @@ PTP_KVM is used for high precision time sync between host and guests. It relies on transferring the wall clock and counter value from the host to the guest using a KVM-specific hypercall. -* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001 +``ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID`` +---------------------------------------- -This hypercall uses the SMC32/HVC32 calling convention: +Retrieve current time information for the specific counter. There are no +endianness restrictions. -ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID - ============== ======== ===================================== - Function ID: (uint32) 0x86000001 - Arguments: (uint32) KVM_PTP_VIRT_COUNTER(0) - KVM_PTP_PHYS_COUNTER(1) - Return Values: (int32) NOT_SUPPORTED(-1) on error, or - (uint32) Upper 32 bits of wall clock time (r0) - (uint32) Lower 32 bits of wall clock time (r1) - (uint32) Upper 32 bits of counter (r2) - (uint32) Lower 32 bits of counter (r3) - Endianness: No Restrictions. - ============== ======== ===================================== ++---------------------+-------------------------------------------------------+ +| Presence: | Optional | ++---------------------+-------------------------------------------------------+ +| Calling convention: | HVC32 | ++---------------------+----------+--------------------------------------------+ +| Function ID: | (uint32) | 0x86000001 | ++---------------------+----------+----+---------------------------------------+ +| Arguments: | (uint32) | R1 | ``KVM_PTP_VIRT_COUNTER (0)`` | +| | | +---------------------------------------+ +| | | | ``KVM_PTP_PHYS_COUNTER (1)`` | ++---------------------+----------+----+---------------------------------------+ +| Return Values: | (int32) | R0 | ``NOT_SUPPORTED (-1)`` on error, else | +| | | | upper 32 bits of wall clock time | +| +----------+----+---------------------------------------+ +| | (uint32) | R1 | Lower 32 bits of wall clock time | +| +----------+----+---------------------------------------+ +| | (uint32) | R2 | Upper 32 bits of counter | +| +----------+----+---------------------------------------+ +| | (uint32) | R3 | Lower 32 bits of counter | ++---------------------+----------+----+---------------------------------------+ diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 31f14ec4a65b6a592f1cc36dfab05a0b14819fdc..d62ba86ee1668e6a0491e4019324b7dd35588980 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -142,8 +142,9 @@ the cpu field to the processor id. :Architectures: ARM64 -2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER ------------------------------------------------------------------------------ +2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER, + KVM_ARM_VCPU_TIMER_IRQ_HVTIMER, KVM_ARM_VCPU_TIMER_IRQ_HPTIMER, +-------------------------------------------------------------------------------- :Parameters: in kvm_device_attr.addr the address for the timer interrupt is a pointer to an int @@ -159,10 +160,12 @@ A value describing the architected timer interrupt number when connected to an in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the attribute overrides the default values (see below). -============================= ========================================== -KVM_ARM_VCPU_TIMER_IRQ_VTIMER The EL1 virtual timer intid (default: 27) -KVM_ARM_VCPU_TIMER_IRQ_PTIMER The EL1 physical timer intid (default: 30) -============================= ========================================== +============================== ========================================== +KVM_ARM_VCPU_TIMER_IRQ_VTIMER The EL1 virtual timer intid (default: 27) +KVM_ARM_VCPU_TIMER_IRQ_PTIMER The EL1 physical timer intid (default: 30) +KVM_ARM_VCPU_TIMER_IRQ_HVTIMER The EL2 virtual timer intid (default: 28) +KVM_ARM_VCPU_TIMER_IRQ_HPTIMER The EL2 physical timer intid (default: 26) +============================== ========================================== Setting the same PPI for different timers will prevent the VCPUs from running. Setting the interrupt number on a VCPU configures all VCPUs created at that diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 219331f22aa35d0fc740cfe7f21d99c3eb0d3a91..03260752105f9dda9ba7a11759519fa92fb24a15 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1457,6 +1457,10 @@ config ARM64_PA_BITS default 48 if ARM64_PA_BITS_48 default 52 if ARM64_PA_BITS_52 +config ARM64_LPA2 + def_bool y + depends on ARM64_PA_BITS_52 && !ARM64_64K_PAGES + choice prompt "Endianness" default CPU_LITTLE_ENDIAN diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index dc636045a9eacbcd2da9e0407018392414dcd360..00c422e6f68db14ec15d34edc818c4886efce5af 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -866,6 +866,11 @@ static __always_inline bool system_supports_mpam_hcr(void) return alternative_has_cap_unlikely(ARM64_MPAM_HCR); } +static inline bool system_supports_lpa2(void) +{ + return cpus_have_final_cap(ARM64_HAS_LPA2); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 5ffc884f080963509882726f8f062df8a7a3567b..828da60fb6a688b61e6e19479bded9c43caddc61 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -132,6 +132,7 @@ #define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803 #define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804 #define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805 +#define QCOM_CPU_PART_ORYON_X1 0x001 #define NVIDIA_CPU_PART_DENVER 0x003 #define NVIDIA_CPU_PART_CARMEL 0x004 @@ -219,6 +220,7 @@ #define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER) #define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD) #define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER) +#define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1) #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 09561219128ce9f49800f28dc7d89bae66b84c0d..bd3e23fcbb7ce8a9fb6bc9a7cc9312b0e29986cf 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -146,7 +146,7 @@ /* Coprocessor traps */ .macro __init_el2_cptr __check_hvhe .LnVHE_\@, x1 - mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN) + mov x0, #CPACR_ELx_FPEN msr cpacr_el1, x0 b .Lskip_set_cptr_\@ .LnVHE_\@: @@ -355,7 +355,7 @@ // (h)VHE case mrs x0, cpacr_el1 // Disable SVE traps - orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) + orr x0, x0, #CPACR_ELx_ZEN msr cpacr_el1, x0 b .Lskip_set_cptr_\@ @@ -376,7 +376,7 @@ // (h)VHE case mrs x0, cpacr_el1 // Disable SME traps - orr x0, x0, #(CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN) + orr x0, x0, #CPACR_ELx_SMEN msr cpacr_el1, x0 b .Lskip_set_cptr_sme_\@ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index b04575ea3a355ee46a96d8d6ff85733e135f952d..da5faaf69b644c346cb00c9c94b2bd22c18f80df 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -117,15 +117,17 @@ #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) #define ESR_ELx_FSC_PERM (0x0C) -#define ESR_ELx_FSC_SEA_TTW0 (0x14) -#define ESR_ELx_FSC_SEA_TTW1 (0x15) -#define ESR_ELx_FSC_SEA_TTW2 (0x16) -#define ESR_ELx_FSC_SEA_TTW3 (0x17) +#define ESR_ELx_FSC_SEA_TTW(n) (0x14 + (n)) #define ESR_ELx_FSC_SECC (0x18) -#define ESR_ELx_FSC_SECC_TTW0 (0x1c) -#define ESR_ELx_FSC_SECC_TTW1 (0x1d) -#define ESR_ELx_FSC_SECC_TTW2 (0x1e) -#define ESR_ELx_FSC_SECC_TTW3 (0x1f) +#define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n)) + +/* Status codes for individual page table levels */ +#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n)) +#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + (n)) + +#define ESR_ELx_FSC_FAULT_nL (0x2C) +#define ESR_ELx_FSC_FAULT_L(n) (((n) < 0 ? ESR_ELx_FSC_FAULT_nL : \ + ESR_ELx_FSC_FAULT) + (n)) /* ISS field definitions for Data Aborts */ #define ESR_ELx_ISV_SHIFT (24) @@ -158,6 +160,8 @@ #define ESR_ELx_Xs_MASK (GENMASK_ULL(4, 0)) /* ISS field definitions for exceptions taken in to Hyp */ +#define ESR_ELx_FSC_ADDRSZ (0x00) +#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n)) #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) @@ -394,6 +398,49 @@ static inline bool esr_is_data_abort(unsigned long esr) return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; } +static inline bool esr_fsc_is_translation_fault(unsigned long esr) +{ + esr = esr & ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_FAULT_L(3)) || + (esr == ESR_ELx_FSC_FAULT_L(2)) || + (esr == ESR_ELx_FSC_FAULT_L(1)) || + (esr == ESR_ELx_FSC_FAULT_L(0)) || + (esr == ESR_ELx_FSC_FAULT_L(-1)); +} + +static inline bool esr_fsc_is_permission_fault(unsigned long esr) +{ + esr = esr & ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_PERM_L(3)) || + (esr == ESR_ELx_FSC_PERM_L(2)) || + (esr == ESR_ELx_FSC_PERM_L(1)) || + (esr == ESR_ELx_FSC_PERM_L(0)); +} + +static inline bool esr_fsc_is_access_flag_fault(unsigned long esr) +{ + esr = esr & ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_ACCESS_L(3)) || + (esr == ESR_ELx_FSC_ACCESS_L(2)) || + (esr == ESR_ELx_FSC_ACCESS_L(1)) || + (esr == ESR_ELx_FSC_ACCESS_L(0)); +} + +/* Indicate whether ESR.EC==0x1A is for an ERETAx instruction */ +static inline bool esr_iss_is_eretax(unsigned long esr) +{ + return esr & ESR_ELx_ERET_ISS_ERET; +} + +/* Indicate which key is used for ERETAx (false: A-Key, true: B-Key) */ +static inline bool esr_iss_is_eretab(unsigned long esr) +{ + return esr & ESR_ELx_ERET_ISS_ERETA; +} + const char *esr_get_class_string(unsigned long esr); #endif /* __ASSEMBLY */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 50bad0e59904d5f9a4d31ec12d938aa15815702a..9aa08fa49de4a2a0c8b2b5d5323b8d03b52e7a4c 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -106,11 +106,12 @@ #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) -#define HCRX_GUEST_FLAGS (HCRX_EL2_SMPME | HCRX_EL2_TCR2En) #define MPAMHCR_HOST_FLAGS 0 /* TCR_EL2 Registers bits */ +#define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) +#define TCR_EL2_HPD (1 << 24) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) @@ -124,6 +125,7 @@ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ +#define VTCR_EL2_DS TCR_EL2_DS #define VTCR_EL2_RES1 (1U << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) @@ -307,6 +309,12 @@ GENMASK(19, 14) | \ BIT(11)) +#define CPTR_VHE_EL2_RES0 (GENMASK(63, 32) | \ + GENMASK(27, 26) | \ + GENMASK(23, 22) | \ + GENMASK(19, 18) | \ + GENMASK(15, 0)) + /* * FGT register definitions * diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 3641f4e2d3a71f56c60d69f9df19d06a373abe87..c9f6757eced19e99f584dca464a6661e39dd350b 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -10,6 +10,7 @@ #include #include #include +#include #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) @@ -72,11 +73,8 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_range, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, - __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr, - __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr, - __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, - __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs, - __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, + __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, @@ -249,15 +247,18 @@ extern void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, phys_addr_t start, unsigned long pages); extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); +extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding); + extern void __kvm_timer_set_cntvoff(u64 cntvoff); +extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_gic_config(void); -extern u64 __vgic_v3_read_vmcr(void); -extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); #define __KVM_EXTABLE(from, to) \ @@ -274,7 +275,7 @@ extern void __vgic_v3_init_lrs(void); asm volatile( \ " mrs %1, spsr_el2\n" \ " mrs %2, elr_el2\n" \ - "1: at "at_op", %3\n" \ + "1: " __msr_s(at_op, "%3") "\n" \ " isb\n" \ " b 9f\n" \ "2: msr spsr_el2, %1\n" \ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 2e4901c8b72612c2ad1fe16b2a00b70da37d298c..c9722195ed7a25e6a890affd970b559082b4bf61 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -11,6 +11,7 @@ #ifndef __ARM64_KVM_EMULATE_H__ #define __ARM64_KVM_EMULATE_H__ +#include #include #include @@ -55,6 +56,14 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu); int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2); int kvm_inject_nested_irq(struct kvm_vcpu *vcpu); +static inline void kvm_inject_nested_sve_trap(struct kvm_vcpu *vcpu) +{ + u64 esr = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SVE) | + ESR_ELx_IL; + + kvm_inject_nested_sync(vcpu, esr); +} + #if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__) static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { @@ -63,45 +72,23 @@ static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) #else static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { - return test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features); + return vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); } #endif static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { - vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; - if (has_vhe() || has_hvhe()) - vcpu->arch.hcr_el2 |= HCR_E2H; - if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) { - /* route synchronous external abort exceptions to EL2 */ - vcpu->arch.hcr_el2 |= HCR_TEA; - /* trap error record accesses */ - vcpu->arch.hcr_el2 |= HCR_TERR; - } + if (!vcpu_has_run_once(vcpu)) + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - vcpu->arch.hcr_el2 |= HCR_FWB; - } else { - /* - * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C - * get set in SCTLR_EL1 such that we can detect when the guest - * MMU gets turned on and do the necessary cache maintenance - * then. - */ + /* + * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C + * get set in SCTLR_EL1 such that we can detect when the guest + * MMU gets turned on and do the necessary cache maintenance + * then. + */ + if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) vcpu->arch.hcr_el2 |= HCR_TVM; - } - - if (cpus_have_final_cap(ARM64_HAS_EVT) && - !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) - vcpu->arch.hcr_el2 |= HCR_TID4; - else - vcpu->arch.hcr_el2 |= HCR_TID2; - - if (vcpu_el1_is_32bit(vcpu)) - vcpu->arch.hcr_el2 &= ~HCR_RW; - - if (kvm_has_mte(vcpu->kvm)) - vcpu->arch.hcr_el2 |= HCR_ATA; } static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) @@ -125,16 +112,6 @@ static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= HCR_TWI; } -static inline void vcpu_ptrauth_enable(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); -} - -static inline void vcpu_ptrauth_disable(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK); -} - static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu) { return vcpu->arch.vsesr_el2; @@ -207,29 +184,30 @@ static inline bool vcpu_is_el2(const struct kvm_vcpu *vcpu) return vcpu_is_el2_ctxt(&vcpu->arch.ctxt); } -static inline bool __vcpu_el2_e2h_is_set(const struct kvm_cpu_context *ctxt) +static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu) { return (!cpus_have_final_cap(ARM64_HAS_HCR_NV1) || - (ctxt_sys_reg(ctxt, HCR_EL2) & HCR_E2H)); + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_E2H)); } -static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu) +static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu) { - return __vcpu_el2_e2h_is_set(&vcpu->arch.ctxt); + return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_TGE; } -static inline bool __vcpu_el2_tge_is_set(const struct kvm_cpu_context *ctxt) +static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) { - return ctxt_sys_reg(ctxt, HCR_EL2) & HCR_TGE; -} + bool e2h, tge; + u64 hcr; -static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu) -{ - return __vcpu_el2_tge_is_set(&vcpu->arch.ctxt); -} + if (!vcpu_has_nv(vcpu)) + return false; + + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + + e2h = (hcr & HCR_E2H); + tge = (hcr & HCR_TGE); -static inline bool __is_hyp_ctxt(const struct kvm_cpu_context *ctxt) -{ /* * We are in a hypervisor context if the vcpu mode is EL2 or * E2H and TGE bits are set. The latter means we are in the user space @@ -238,14 +216,12 @@ static inline bool __is_hyp_ctxt(const struct kvm_cpu_context *ctxt) * Note that the HCR_EL2.{E2H,TGE}={0,1} isn't really handled in the * rest of the KVM code, and will result in a misbehaving guest. */ - return vcpu_is_el2_ctxt(ctxt) || - (__vcpu_el2_e2h_is_set(ctxt) && __vcpu_el2_tge_is_set(ctxt)) || - __vcpu_el2_tge_is_set(ctxt); + return vcpu_is_el2(vcpu) || (e2h && tge) || tge; } -static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) +static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu) { - return vcpu_has_nv(vcpu) && __is_hyp_ctxt(&vcpu->arch.ctxt); + return is_hyp_ctxt(vcpu) && !vcpu_is_el2(vcpu); } /* @@ -401,29 +377,34 @@ static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC; } -static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu) +static inline +bool kvm_vcpu_trap_is_permission_fault(const struct kvm_vcpu *vcpu) { - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE; + return esr_fsc_is_permission_fault(kvm_vcpu_get_esr(vcpu)); } -static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu) +static inline +bool kvm_vcpu_trap_is_translation_fault(const struct kvm_vcpu *vcpu) { - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL; + return esr_fsc_is_translation_fault(kvm_vcpu_get_esr(vcpu)); +} + +static inline +u64 kvm_vcpu_trap_get_perm_fault_granule(const struct kvm_vcpu *vcpu) +{ + unsigned long esr = kvm_vcpu_get_esr(vcpu); + + BUG_ON(!esr_fsc_is_permission_fault(esr)); + return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(esr & ESR_ELx_FSC_LEVEL)); } static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) { switch (kvm_vcpu_trap_get_fault(vcpu)) { case ESR_ELx_FSC_EXTABT: - case ESR_ELx_FSC_SEA_TTW0: - case ESR_ELx_FSC_SEA_TTW1: - case ESR_ELx_FSC_SEA_TTW2: - case ESR_ELx_FSC_SEA_TTW3: + case ESR_ELx_FSC_SEA_TTW(-1) ... ESR_ELx_FSC_SEA_TTW(3): case ESR_ELx_FSC_SECC: - case ESR_ELx_FSC_SECC_TTW0: - case ESR_ELx_FSC_SECC_TTW1: - case ESR_ELx_FSC_SECC_TTW2: - case ESR_ELx_FSC_SECC_TTW3: + case ESR_ELx_FSC_SECC_TTW(-1) ... ESR_ELx_FSC_SECC_TTW(3): return true; default: return false; @@ -451,12 +432,7 @@ static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) * first), then a permission fault to allow the flags * to be set. */ - switch (kvm_vcpu_trap_get_fault_type(vcpu)) { - case ESR_ELx_FSC_PERM: - return true; - default: - return false; - } + return kvm_vcpu_trap_is_permission_fault(vcpu); } if (kvm_vcpu_trap_is_iabt(vcpu)) @@ -567,6 +543,68 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu_set_flag((v), e); \ } while (0) +#define __build_check_all_or_none(r, bits) \ + BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits)) + +#define __cpacr_to_cptr_clr(clr, set) \ + ({ \ + u64 cptr = 0; \ + \ + if ((set) & CPACR_ELx_FPEN) \ + cptr |= CPTR_EL2_TFP; \ + if ((set) & CPACR_ELx_ZEN) \ + cptr |= CPTR_EL2_TZ; \ + if ((set) & CPACR_ELx_SMEN) \ + cptr |= CPTR_EL2_TSM; \ + if ((clr) & CPACR_ELx_TTA) \ + cptr |= CPTR_EL2_TTA; \ + if ((clr) & CPTR_EL2_TAM) \ + cptr |= CPTR_EL2_TAM; \ + if ((clr) & CPTR_EL2_TCPAC) \ + cptr |= CPTR_EL2_TCPAC; \ + \ + cptr; \ + }) + +#define __cpacr_to_cptr_set(clr, set) \ + ({ \ + u64 cptr = 0; \ + \ + if ((clr) & CPACR_ELx_FPEN) \ + cptr |= CPTR_EL2_TFP; \ + if ((clr) & CPACR_ELx_ZEN) \ + cptr |= CPTR_EL2_TZ; \ + if ((clr) & CPACR_ELx_SMEN) \ + cptr |= CPTR_EL2_TSM; \ + if ((set) & CPACR_ELx_TTA) \ + cptr |= CPTR_EL2_TTA; \ + if ((set) & CPTR_EL2_TAM) \ + cptr |= CPTR_EL2_TAM; \ + if ((set) & CPTR_EL2_TCPAC) \ + cptr |= CPTR_EL2_TCPAC; \ + \ + cptr; \ + }) + +#define cpacr_clear_set(clr, set) \ + do { \ + BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \ + BUILD_BUG_ON((clr) & CPACR_ELx_E0POE); \ + __build_check_all_or_none((clr), CPACR_ELx_FPEN); \ + __build_check_all_or_none((set), CPACR_ELx_FPEN); \ + __build_check_all_or_none((clr), CPACR_ELx_ZEN); \ + __build_check_all_or_none((set), CPACR_ELx_ZEN); \ + __build_check_all_or_none((clr), CPACR_ELx_SMEN); \ + __build_check_all_or_none((set), CPACR_ELx_SMEN); \ + \ + if (has_vhe() || has_hvhe()) \ + sysreg_clear_set(cpacr_el1, clr, set); \ + else \ + sysreg_clear_set(cptr_el2, \ + __cpacr_to_cptr_clr(clr, set), \ + __cpacr_to_cptr_set(clr, set));\ + } while (0) + static __always_inline void kvm_write_cptr_el2(u64 val) { if (has_vhe() || has_hvhe()) @@ -575,40 +613,81 @@ static __always_inline void kvm_write_cptr_el2(u64 val) write_sysreg(val, cptr_el2); } -static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) +/* Resets the value of cptr_el2 when returning to the host. */ +static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) { u64 val; if (has_vhe()) { - val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | - CPACR_EL1_ZEN_EL1EN); + val = (CPACR_ELx_FPEN | CPACR_EL1_ZEN_EL1EN); if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_EL1_SMEN_EL1EN; } else if (has_hvhe()) { - val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); + val = CPACR_ELx_FPEN; - if (!vcpu_has_sve(vcpu) || - (*host_data_ptr(fp_owner) != FP_STATE_GUEST_OWNED)) - val |= CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN; + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPACR_ELx_ZEN; if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN; + val |= CPACR_ELx_SMEN; } else { val = CPTR_NVHE_EL2_RES1; - if (vcpu_has_sve(vcpu) && - (*host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED)) + if (vcpu_has_sve(vcpu) && guest_owns_fp_regs()) val |= CPTR_EL2_TZ; if (cpus_have_final_cap(ARM64_SME)) val &= ~CPTR_EL2_TSM; } - return val; + kvm_write_cptr_el2(val); } -static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) +/* + * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE + * format if E2H isn't set. + */ +static inline u64 vcpu_sanitised_cptr_el2(const struct kvm_vcpu *vcpu) { - u64 val = kvm_get_reset_cptr_el2(vcpu); + u64 cptr = __vcpu_sys_reg(vcpu, CPTR_EL2); - kvm_write_cptr_el2(val); + if (!vcpu_el2_e2h_is_set(vcpu)) + cptr = translate_cptr_el2_to_cpacr_el1(cptr); + + return cptr; +} + +static inline bool ____cptr_xen_trap_enabled(const struct kvm_vcpu *vcpu, + unsigned int xen) +{ + switch (xen) { + case 0b00: + case 0b10: + return true; + case 0b01: + return vcpu_el2_tge_is_set(vcpu) && !vcpu_is_el2(vcpu); + case 0b11: + default: + return false; + } +} + +#define __guest_hyp_cptr_xen_trap_enabled(vcpu, xen) \ + (!vcpu_has_nv(vcpu) ? false : \ + ____cptr_xen_trap_enabled(vcpu, \ + SYS_FIELD_GET(CPACR_ELx, xen, \ + vcpu_sanitised_cptr_el2(vcpu)))) + +static inline bool guest_hyp_fpsimd_traps_enabled(const struct kvm_vcpu *vcpu) +{ + return __guest_hyp_cptr_xen_trap_enabled(vcpu, FPEN); +} + +static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu) +{ + return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN); +} + +static inline void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) +{ + vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); } #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f78747a9508f6389c63de156030bada0ca435ec5..8ce75badd0dfbf2fd190bfdb63b0dfd1a2c83e89 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -51,6 +51,7 @@ #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) #define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) +#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -78,7 +79,7 @@ extern unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void); u32 __attribute_const__ kvm_target_cpu(void); -int kvm_reset_vcpu(struct kvm_vcpu *vcpu); +void kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); struct kvm_hyp_memcache { @@ -187,6 +188,39 @@ struct kvm_s2_mmu { uint64_t split_page_chunk_size; struct kvm_arch *arch; + + /* + * For a shadow stage-2 MMU, the virtual vttbr used by the + * host to parse the guest S2. + * This either contains: + * - the virtual VTTBR programmed by the guest hypervisor with + * CnP cleared + * - The value 1 (VMID=0, BADDR=0, CnP=1) if invalid + * + * We also cache the full VTCR which gets used for TLB invalidation, + * taking the ARM ARM's "Any of the bits in VTCR_EL2 are permitted + * to be cached in a TLB" to the letter. + */ + u64 tlb_vttbr; + u64 tlb_vtcr; + + /* + * true when this represents a nested context where virtual + * HCR_EL2.VM == 1 + */ + bool nested_stage2_enabled; + + /* + * true when this MMU needs to be unmapped before being used for a new + * purpose. + */ + bool pending_unmap; + + /* + * 0: Nobody is currently using this, check vttbr for validity + * >0: Somebody is actively using this. + */ + atomic_t refcnt; }; struct kvm_arch_memory_slot { @@ -210,6 +244,7 @@ typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; + bool enabled; }; struct kvm_mpidr_data { @@ -263,6 +298,14 @@ struct kvm_arch { */ u64 fgu[__NR_FGT_GROUP_IDS__]; + /* + * Stage 2 paging state for VMs with nested S2 using a virtual + * VMID. + */ + struct kvm_s2_mmu *nested_mmus; + size_t nested_mmus_size; + int nested_mmus_next; + /* Interrupt controller */ struct vgic_dist vgic; @@ -334,12 +377,12 @@ struct kvm_arch { * Atomic access to multiple idregs are guarded by kvm_arch.config_lock. */ #define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id)) -#define IDX_IDREG(idx) sys_reg(3, 0, 0, ((idx) >> 3) + 1, (idx) & Op2_mask) -#define IDREG(kvm, id) ((kvm)->arch.id_regs[IDREG_IDX(id)]) #define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) u64 id_regs[KVM_ARM_ID_REG_NUM]; - /* Masks for VNCR-baked sysregs */ + u64 ctr_el0; + + /* Masks for VNCR-backed and general EL2 sysregs */ struct kvm_sysreg_masks *sysreg_masks; /* @@ -375,6 +418,9 @@ struct kvm_vcpu_fault_info { r = __VNCR_START__ + ((VNCR_ ## r) / 8), \ __after_##r = __MAX__(__before_##r - 1, r) +#define MARKER(m) \ + m, __after_##m = m - 1 + enum vcpu_sysreg { __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ @@ -429,12 +475,14 @@ enum vcpu_sysreg { /* EL2 registers */ SCTLR_EL2, /* System Control Register (EL2) */ ACTLR_EL2, /* Auxiliary Control Register (EL2) */ - MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ + ZCR_EL2, /* SVE Control Register (EL2) */ TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ TCR_EL2, /* Translation Control Register (EL2) */ + PIRE0_EL2, /* Permission Indirection Register 0 (EL2) */ + PIR_EL2, /* Permission Indirection Register 1 (EL2) */ SPSR_EL2, /* EL2 saved program status register */ ELR_EL2, /* EL2 exception link register */ AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */ @@ -447,14 +495,20 @@ enum vcpu_sysreg { VBAR_EL2, /* Vector Base Address Register (EL2) */ RVBAR_EL2, /* Reset Vector Base Address Register */ CONTEXTIDR_EL2, /* Context ID Register (EL2) */ - CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ SP_EL2, /* EL2 Stack Pointer */ CNTHP_CTL_EL2, CNTHP_CVAL_EL2, CNTHV_CTL_EL2, CNTHV_CVAL_EL2, - __VNCR_START__, /* Any VNCR-capable reg goes after this point */ + /* Anything from this can be RES0/RES1 sanitised */ + MARKER(__SANITISED_REG_START__), + TCR2_EL2, /* Extended Translation Control Register (EL2) */ + MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ + CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ + + /* Any VNCR-capable reg goes after this point */ + MARKER(__VNCR_START__), VNCR(SCTLR_EL1),/* System Control Register */ VNCR(ACTLR_EL1),/* Auxiliary Control Register */ @@ -503,6 +557,8 @@ enum vcpu_sysreg { VNCR(CNTP_CVAL_EL0), VNCR(CNTP_CTL_EL0), + VNCR(ICH_HCR_EL2), + NR_SYS_REGS /* Nothing after this line! */ }; @@ -510,7 +566,7 @@ struct kvm_sysreg_masks { struct { u64 res0; u64 res1; - } mask[NR_SYS_REGS - __VNCR_START__]; + } mask[NR_SYS_REGS - __SANITISED_REG_START__]; }; struct kvm_cpu_context { @@ -531,6 +587,20 @@ struct kvm_cpu_context { u64 *vncr_array; }; +struct cpu_sve_state { + __u64 zcr_el1; + + /* + * Ordering is important since __sve_save_state/__sve_restore_state + * relies on it. + */ + __u32 fpsr; + __u32 fpcr; + + /* Must be SVE_VQ_BYTES (128 bit) aligned. */ + __u8 sve_regs[]; +}; + /* * This structure is instantiated on a per-CPU basis, and contains * data that is: @@ -555,7 +625,15 @@ struct kvm_host_data { unsigned long flags; struct kvm_cpu_context host_ctxt; - struct user_fpsimd_state *fpsimd_state; /* hyp VA */ + + /* + * Hyp VA. + * sve_state is only used in pKVM and if system_supports_sve(). + */ + struct cpu_sve_state *sve_state; + + /* Used by pKVM only. */ + u64 fpmr; /* Ownership of the FP regs */ enum { @@ -681,8 +759,6 @@ struct kvm_vcpu_arch { struct kvm_guest_debug_arch vcpu_debug_state; struct kvm_guest_debug_arch external_debug_state; - struct task_struct *parent_task; - /* VGIC state */ struct vgic_cpu vgic_cpu; struct arch_timer_cpu timer_cpu; @@ -707,9 +783,6 @@ struct kvm_vcpu_arch { /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; - /* feature flags */ - DECLARE_BITMAP(features, KVM_VCPU_MAX_FEATURES); - /* Virtual SError ESR to restore when HCR_EL2.VSE is set */ u64 vsesr_el2; @@ -876,6 +949,9 @@ struct kvm_vcpu_arch { #define vcpu_sve_max_vq(vcpu) sve_vq_from_vl((vcpu)->arch.sve_max_vl) +#define vcpu_sve_zcr_elx(vcpu) \ + (unlikely(is_hyp_ctxt(vcpu)) ? ZCR_EL2 : ZCR_EL1) + #define vcpu_sve_state_size(vcpu) ({ \ size_t __size_ret; \ unsigned int __vcpu_vq; \ @@ -928,7 +1004,7 @@ struct kvm_vcpu_arch { * Don't bother with VNCR-based accesses in the nVHE code, it has no * business dealing with NV. */ -static inline u64 *__ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) +static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) { #if !defined (__KVM_NVHE_HYPERVISOR__) if (unlikely(cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && @@ -938,15 +1014,22 @@ static inline u64 *__ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) return (u64 *)&ctxt->sys_regs[r]; } +#define __ctxt_sys_reg(c,r) \ + ({ \ + BUILD_BUG_ON(__builtin_constant_p(r) && \ + (r) >= NR_SYS_REGS); \ + ___ctxt_sys_reg(c, r); \ + }) + #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) -u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *, enum vcpu_sysreg); +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); #define __vcpu_sys_reg(v,r) \ (*({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ u64 *__r = __ctxt_sys_reg(ctxt, (r)); \ - if (vcpu_has_nv((v)) && (r) >= __VNCR_START__) \ - *__r = kvm_vcpu_sanitise_vncr_reg((v), (r)); \ + if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ + *__r = kvm_vcpu_apply_reg_masks((v), (r), *__r);\ __r; \ })) @@ -993,6 +1076,7 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break; + case ZCR_EL1: *val = read_sysreg_s(SYS_ZCR_EL12); break; default: return false; } @@ -1038,6 +1122,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; + case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break; default: return false; } @@ -1149,7 +1234,7 @@ int __init populate_nv_trap_config(void); bool lock_all_vcpus(struct kvm *kvm); void unlock_all_vcpus(struct kvm *kvm); -void kvm_init_sysreg(struct kvm_vcpu *); +void kvm_calculate_traps(struct kvm_vcpu *vcpu); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); @@ -1266,6 +1351,18 @@ DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data); &this_cpu_ptr_hyp_sym_wrapper(kvm_host_data)->f) #endif +/* Check whether the FP regs are owned by the guest */ +static inline bool guest_owns_fp_regs(void) +{ + return *host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED; +} + +/* Check whether the FP regs are owned by the host */ +static inline bool host_owns_fp_regs(void) +{ + return *host_data_ptr(fp_owner) == FP_STATE_HOST_OWNED; +} + static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { /* The host's MPIDR is immutable, so let's set it up at boot time */ @@ -1309,7 +1406,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu); -void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu); static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) { @@ -1337,6 +1433,7 @@ void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu); void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu); int __init kvm_set_ipa_limit(void); +u32 kvm_get_pa_bits(struct kvm *kvm); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); @@ -1345,10 +1442,9 @@ struct kvm *kvm_arch_alloc_vm(void); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE -static inline bool kvm_vm_is_protected(struct kvm *kvm) -{ - return false; -} +#define kvm_vm_is_protected(kvm) (is_protected_kvm_enabled() && (kvm)->arch.pkvm.enabled) + +#define vcpu_is_protected(vcpu) kvm_vm_is_protected((vcpu)->kvm) int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); @@ -1373,6 +1469,8 @@ static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature) #define vcpu_has_feature(v, f) __vcpu_has_feature(&(v)->kvm->arch, (f)) +#define kvm_vcpu_initialized(v) vcpu_get_flag(vcpu, VCPU_INITIALIZED) + int kvm_trng_call(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM extern phys_addr_t hyp_mem_base; @@ -1385,6 +1483,24 @@ static inline void kvm_hyp_reserve(void) { } void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); +static inline u64 *__vm_id_reg(struct kvm_arch *ka, u32 reg) +{ + switch (reg) { + case sys_reg(3, 0, 0, 1, 0) ... sys_reg(3, 0, 0, 7, 7): + return &ka->id_regs[IDREG_IDX(reg)]; + case SYS_CTR_EL0: + return &ka->ctr_el0; + default: + WARN_ON_ONCE(1); + return NULL; + } +} + +#define kvm_read_vm_id_reg(kvm, reg) \ + ({ u64 __val = *__vm_id_reg(&(kvm)->arch, reg); __val; }) + +void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); + #define __expand_field_sign_unsigned(id, fld, val) \ ((u64)SYS_FIELD_VALUE(id, fld, val)) @@ -1394,14 +1510,9 @@ bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); sign_extend64(__val, id##_##fld##_WIDTH - 1); \ }) -#define expand_field_sign(id, fld, val) \ - (id##_##fld##_SIGNED ? \ - __expand_field_sign_signed(id, fld, val) : \ - __expand_field_sign_unsigned(id, fld, val)) - #define get_idreg_field_unsigned(kvm, id, fld) \ ({ \ - u64 __val = IDREG((kvm), SYS_##id); \ + u64 __val = kvm_read_vm_id_reg((kvm), SYS_##id); \ FIELD_GET(id##_##fld##_MASK, __val); \ }) @@ -1414,20 +1525,26 @@ bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); #define get_idreg_field_enum(kvm, id, fld) \ get_idreg_field_unsigned(kvm, id, fld) -#define get_idreg_field(kvm, id, fld) \ +#define kvm_cmp_feat_signed(kvm, id, fld, op, limit) \ + (get_idreg_field_signed((kvm), id, fld) op __expand_field_sign_signed(id, fld, limit)) + +#define kvm_cmp_feat_unsigned(kvm, id, fld, op, limit) \ + (get_idreg_field_unsigned((kvm), id, fld) op __expand_field_sign_unsigned(id, fld, limit)) + +#define kvm_cmp_feat(kvm, id, fld, op, limit) \ (id##_##fld##_SIGNED ? \ - get_idreg_field_signed(kvm, id, fld) : \ - get_idreg_field_unsigned(kvm, id, fld)) + kvm_cmp_feat_signed(kvm, id, fld, op, limit) : \ + kvm_cmp_feat_unsigned(kvm, id, fld, op, limit)) #define kvm_has_feat(kvm, id, fld, limit) \ - (get_idreg_field((kvm), id, fld) >= expand_field_sign(id, fld, limit)) + kvm_cmp_feat(kvm, id, fld, >=, limit) #define kvm_has_feat_enum(kvm, id, fld, val) \ - (get_idreg_field_unsigned((kvm), id, fld) == __expand_field_sign_unsigned(id, fld, val)) + kvm_cmp_feat_unsigned(kvm, id, fld, ==, val) #define kvm_has_feat_range(kvm, id, fld, min, max) \ - (get_idreg_field((kvm), id, fld) >= expand_field_sign(id, fld, min) && \ - get_idreg_field((kvm), id, fld) <= expand_field_sign(id, fld, max)) + (kvm_cmp_feat(kvm, id, fld, >=, min) && \ + kvm_cmp_feat(kvm, id, fld, <=, max)) #ifdef CONFIG_KVM_ARM_HOST_VHE_ONLY struct kvm_pmu_ops { @@ -1490,4 +1607,23 @@ static inline void host_kvm_vcpu_pmu_resync_el0(void) u32 kvm_pv_cpu_freq_get(struct kvm_vcpu *vcpu); +/* Check for a given level of PAuth support */ +#define kvm_has_pauth(k, l) \ + ({ \ + bool pa, pi, pa3; \ + \ + pa = kvm_has_feat((k), ID_AA64ISAR1_EL1, APA, l); \ + pa &= kvm_has_feat((k), ID_AA64ISAR1_EL1, GPA, IMP); \ + pi = kvm_has_feat((k), ID_AA64ISAR1_EL1, API, l); \ + pi &= kvm_has_feat((k), ID_AA64ISAR1_EL1, GPI, IMP); \ + pa3 = kvm_has_feat((k), ID_AA64ISAR2_EL1, APA3, l); \ + pa3 &= kvm_has_feat((k), ID_AA64ISAR2_EL1, GPA3, IMP); \ + \ + (pa + pi + pa3) == 1; \ + }) + +#define kvm_has_fpmr(k) \ + (system_supports_fpmr() && \ + kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 01d0028b355132b296b479d872d851bc0df9bd51..2f13cbe4e63d8b47dd3ee42da1b0703bde15495f 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -84,8 +84,8 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); #ifdef __KVM_NVHE_HYPERVISOR__ @@ -115,7 +115,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); -void __sve_restore_state(void *sve_pffr, u32 *fpsr); +void __sve_save_state(void *sve_pffr, u32 *fpsr, int save_ffr); +void __sve_restore_state(void *sve_pffr, u32 *fpsr, int restore_ffr); u64 __guest_enter(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 363bb900758da81f0c0cd5f1aacc2397babccb81..ff4ac49fa80d15f92a0874cb210b22a268eec77a 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -119,6 +119,7 @@ alternative_cb_end #include #include #include +#include #ifdef CONFIG_KVM_ARM_HOST_VHE_ONLY static inline void kvm_compute_layout(void) @@ -183,6 +184,11 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr); void __init free_hyp_pgds(void); +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block); +void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); +void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); + void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); void kvm_uninit_stage2_mmu(struct kvm *kvm); @@ -327,6 +333,26 @@ static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) return container_of(mmu->arch, struct kvm, arch); } +static inline u64 get_vmid(u64 vttbr) +{ + return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >> + VTTBR_VMID_SHIFT; +} + +static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu) +{ + return !(mmu->tlb_vttbr & VTTBR_CNP_BIT); +} + +static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +{ + /* + * Be careful, mmu may not be fully initialised so do look at + * *any* of its fields. + */ + return &kvm->arch.mmu != mmu; +} + /* * ARM64 KVM relies on a simple conversion from physaddr to a kernel * virtual address (KVA) when it does cache maintenance as the CMO diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 053226eae4e550326a4839726120f0eef3ced08d..ee8bcef28144c554f55940e12a63d8f2b02a0362 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -3,14 +3,15 @@ #define __ARM64_KVM_NESTED_H #include -#include #include +#include +#include static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) { return (!__is_defined(__KVM_NVHE_HYPERVISOR__) && cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && - test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->arch.features)); + vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2)); } /* Translation helpers from non-VHE EL2 to EL1 */ @@ -32,7 +33,7 @@ static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr) static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2) { - u64 cpacr_el1 = 0; + u64 cpacr_el1 = CPACR_ELx_RES1; if (cptr_el2 & CPTR_EL2_TTA) cpacr_el1 |= CPACR_ELx_TTA; @@ -41,6 +42,8 @@ static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2) if (!(cptr_el2 & CPTR_EL2_TZ)) cpacr_el1 |= CPACR_ELx_ZEN; + cpacr_el1 |= cptr_el2 & (CPTR_EL2_TCPAC | CPTR_EL2_TAM); + return cpacr_el1; } @@ -60,8 +63,143 @@ static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0) return ttbr0 & ~GENMASK_ULL(63, 48); } +extern bool forward_smc_trap(struct kvm_vcpu *vcpu); +extern void kvm_init_nested(struct kvm *kvm); +extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu); +extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu); +extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu); + +union tlbi_info; + +extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, + const union tlbi_info *info, + void (*)(struct kvm_s2_mmu *, + const union tlbi_info *)); +extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); +extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); + +extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu); + +struct kvm_s2_trans { + phys_addr_t output; + unsigned long block_size; + bool writable; + bool readable; + int level; + u32 esr; + u64 desc; +}; + +static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans) +{ + return trans->output; +} + +static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans) +{ + return trans->block_size; +} + +static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans) +{ + return trans->esr; +} + +static inline bool kvm_s2_trans_readable(struct kvm_s2_trans *trans) +{ + return trans->readable; +} + +static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) +{ + return trans->writable; +} + +static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) +{ + return !(trans->desc & BIT(54)); +} + +extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, + struct kvm_s2_trans *result); +extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, + struct kvm_s2_trans *trans); +extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2); +extern void kvm_nested_s2_wp(struct kvm *kvm); +extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block); +extern void kvm_nested_s2_flush(struct kvm *kvm); + +unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val); + +static inline bool kvm_supported_tlbi_s1e1_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + + if (!(sys_reg_Op0(instr) == TLBI_Op0 && + sys_reg_Op1(instr) == TLBI_Op1_EL1)) + return false; + + if (!(sys_reg_CRn(instr) == TLBI_CRn_XS || + (sys_reg_CRn(instr) == TLBI_CRn_nXS && + kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))) + return false; + + if (CRm == TLBI_CRm_nROS && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS || + CRm == TLBI_CRm_RNS) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + return true; +} + +static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + + if (!(sys_reg_Op0(instr) == TLBI_Op0 && + sys_reg_Op1(instr) == TLBI_Op1_EL2)) + return false; + + if (!(sys_reg_CRn(instr) == TLBI_CRn_XS || + (sys_reg_CRn(instr) == TLBI_CRn_nXS && + kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))) + return false; + + if (CRm == TLBI_CRm_IPAIS || CRm == TLBI_CRm_IPAONS) + return false; + + if (CRm == TLBI_CRm_nROS && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS || + CRm == TLBI_CRm_RNS) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + return true; +} + +int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu); +u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val); -int kvm_init_nv_sysregs(struct kvm *kvm); +#ifdef CONFIG_ARM64_PTR_AUTH +bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr); +#else +static inline bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr) +{ + /* We really should never execute this... */ + WARN_ON_ONCE(1); + *elr = 0xbad9acc0debadbad; + return false; +} +#endif #define vncr_fixmap(c) \ ({ \ @@ -70,4 +208,47 @@ int kvm_init_nv_sysregs(struct kvm *kvm); (FIX_VNCR - __c); \ }) +#define KVM_NV_GUEST_MAP_SZ (KVM_PGTABLE_PROT_SW1 | KVM_PGTABLE_PROT_SW0) + +static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans) +{ + return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level); +} + +/* Adjust alignment for the contiguous bit as per StageOA() */ +#define contiguous_bit_shift(d, wi, l) \ + ({ \ + u8 shift = 0; \ + \ + if ((d) & PTE_CONT) { \ + switch (BIT((wi)->pgshift)) { \ + case SZ_4K: \ + shift = 4; \ + break; \ + case SZ_16K: \ + shift = (l) == 2 ? 5 : 7; \ + break; \ + case SZ_64K: \ + shift = 5; \ + break; \ + } \ + } \ + \ + shift; \ + }) + +static inline unsigned int ps_to_output_size(unsigned int ps) +{ + switch (ps) { + case 0: return 32; + case 1: return 36; + case 2: return 40; + case 3: return 42; + case 4: return 44; + case 5: + default: + return 48; + } +} + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 28446403e63c2a088dca64bfc234bc3da993f0e4..19278dfe79782561013d4c2b03e9a866615479b0 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -11,7 +11,8 @@ #include #include -#define KVM_PGTABLE_MAX_LEVELS 4U +#define KVM_PGTABLE_FIRST_LEVEL -1 +#define KVM_PGTABLE_LAST_LEVEL 3 /* * The largest supported block sizes for KVM (no 52-bit PA support): @@ -20,19 +21,29 @@ * - 64K (level 2): 512MB */ #ifdef CONFIG_ARM64_4K_PAGES -#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1U +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1 #else -#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2U +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2 #endif -#define kvm_lpa2_is_enabled() false +#define kvm_lpa2_is_enabled() system_supports_lpa2() + +static inline u64 kvm_get_parange_max(void) +{ + if (kvm_lpa2_is_enabled() || + (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SHIFT == 16)) + return ID_AA64MMFR0_EL1_PARANGE_52; + else + return ID_AA64MMFR0_EL1_PARANGE_48; +} static inline u64 kvm_get_parange(u64 mmfr0) { + u64 parange_max = kvm_get_parange_max(); u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - if (parange > ID_AA64MMFR0_EL1_PARANGE_MAX) - parange = ID_AA64MMFR0_EL1_PARANGE_MAX; + if (parange > parange_max) + parange = parange_max; return parange; } @@ -43,6 +54,8 @@ typedef u64 kvm_pte_t; #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) +#define KVM_PTE_ADDR_MASK_LPA2 GENMASK(49, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_50_LPA2 GENMASK(9, 8) #define KVM_PHYS_INVALID (-1ULL) @@ -53,21 +66,34 @@ static inline bool kvm_pte_valid(kvm_pte_t pte) static inline u64 kvm_pte_to_phys(kvm_pte_t pte) { - u64 pa = pte & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + u64 pa; + + if (kvm_lpa2_is_enabled()) { + pa = pte & KVM_PTE_ADDR_MASK_LPA2; + pa |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, pte) << 50; + } else { + pa = pte & KVM_PTE_ADDR_MASK; + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + } return pa; } static inline kvm_pte_t kvm_phys_to_pte(u64 pa) { - kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) { - pa &= GENMASK(51, 48); - pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + kvm_pte_t pte; + + if (kvm_lpa2_is_enabled()) { + pte = pa & KVM_PTE_ADDR_MASK_LPA2; + pa &= GENMASK(51, 50); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_50_LPA2, pa >> 50); + } else { + pte = pa & KVM_PTE_ADDR_MASK; + if (PAGE_SHIFT == 16) { + pa &= GENMASK(51, 48); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + } } return pte; @@ -78,28 +104,28 @@ static inline kvm_pfn_t kvm_pte_to_pfn(kvm_pte_t pte) return __phys_to_pfn(kvm_pte_to_phys(pte)); } -static inline u64 kvm_granule_shift(u32 level) +static inline u64 kvm_granule_shift(s8 level) { - /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + /* Assumes KVM_PGTABLE_LAST_LEVEL is 3 */ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); } -static inline u64 kvm_granule_size(u32 level) +static inline u64 kvm_granule_size(s8 level) { return BIT(kvm_granule_shift(level)); } -static inline bool kvm_level_supports_block_mapping(u32 level) +static inline bool kvm_level_supports_block_mapping(s8 level) { return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } static inline u32 kvm_supported_block_sizes(void) { - u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; + s8 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; u32 r = 0; - for (; level < KVM_PGTABLE_MAX_LEVELS; level++) + for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) r |= BIT(kvm_granule_shift(level)); return r; @@ -144,7 +170,7 @@ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); - void (*free_unlinked_table)(void *addr, u32 level); + void (*free_unlinked_table)(void *addr, s8 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); @@ -242,7 +268,7 @@ struct kvm_pgtable_visit_ctx { u64 start; u64 addr; u64 end; - u32 level; + s8 level; enum kvm_pgtable_walk_flags flags; }; @@ -345,7 +371,7 @@ static inline bool kvm_pgtable_walk_lock_held(void) */ struct kvm_pgtable { u32 ia_bits; - u32 start_level; + s8 start_level; kvm_pteref_t pgd; struct kvm_pgtable_mm_ops *mm_ops; @@ -479,7 +505,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); * The page-table is assumed to be unreachable by any hardware walkers prior to * freeing and therefore no TLB invalidation is performed. */ -void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level); /** * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure. @@ -503,7 +529,7 @@ void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *p * an ERR_PTR(error) on failure. */ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, - u64 phys, u32 level, + u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte); @@ -729,7 +755,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, - kvm_pte_t *ptep, u32 *level); + kvm_pte_t *ptep, s8 *level); /** * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index eac2a015a868ec939cbc604a1c0c5e29aa79ed39..7eb55bb52e0c9f50200c0f97983756a45871fe71 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -36,6 +36,31 @@ int pkvm_create_hyp_vm(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); #endif +/* + * This functions as an allow-list of protected VM capabilities. + * Features not explicitly allowed by this function are denied. + */ +static inline bool kvm_pvm_ext_allowed(long ext) +{ + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_ARM_PSCI: + case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_NR_VCPUS: + case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: + case KVM_CAP_MSI_DEVID: + case KVM_CAP_ARM_VM_IPA_SIZE: + case KVM_CAP_ARM_PMU_V3: + case KVM_CAP_ARM_SVE: + case KVM_CAP_ARM_PTRAUTH_ADDRESS: + case KVM_CAP_ARM_PTRAUTH_GENERIC: + return true; + default: + return false; + } +} + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); @@ -72,10 +97,11 @@ static inline unsigned long hyp_vm_table_pages(void) static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { - unsigned long total = 0, i; + unsigned long total = 0; + int i; /* Provision the worst case scenario */ - for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { + for (i = KVM_PGTABLE_FIRST_LEVEL; i <= KVM_PGTABLE_LAST_LEVEL; i++) { nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); total += nr_pages; } @@ -143,4 +169,13 @@ static inline unsigned long hyp_ffa_proxy_pages(void) return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); } +static inline size_t pkvm_host_sve_state_size(void) +{ + if (!system_supports_sve()) + return 0; + + return size_add(sizeof(struct cpu_sve_state), + SVE_SIG_REGS_SIZE(sve_vq_from_vl(kvm_host_sve_max_vl))); +} + #endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h index 0cd0965255d25ba1b5a65b3107ff879931b29632..d81bac256abc3599e2603ce4d15d2c47a3d6cf6a 100644 --- a/arch/arm64/include/asm/kvm_ptrauth.h +++ b/arch/arm64/include/asm/kvm_ptrauth.h @@ -99,5 +99,26 @@ alternative_else_nop_endif .macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3 .endm #endif /* CONFIG_ARM64_PTR_AUTH */ + +#else /* !__ASSEMBLY */ + +#define __ptrauth_save_key(ctxt, key) \ + do { \ + u64 __val; \ + __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \ + ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val; \ + __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \ + ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \ + } while(0) + +#define ptrauth_save_keys(ctxt) \ + do { \ + __ptrauth_save_key(ctxt, APIA); \ + __ptrauth_save_key(ctxt, APIB); \ + __ptrauth_save_key(ctxt, APDA); \ + __ptrauth_save_key(ctxt, APDB); \ + __ptrauth_save_key(ctxt, APGA); \ + } while(0) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_KVM_PTRAUTH_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index e4944d517c9995a4c5fc21eb0cf261bff8ec0181..99ec236830ffa3b55c7bc75e920534c4dd470f5a 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -183,6 +183,11 @@ */ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) +/* + * Hierarchical permission for Stage-1 tables + */ +#define S1_TABLE_AP (_AT(pmdval_t, 3) << 61) + /* * Highest possible physical address supported. */ @@ -277,6 +282,11 @@ #define TCR_TBI1 (UL(1) << 38) #define TCR_HA (UL(1) << 39) #define TCR_HD (UL(1) << 40) +#define TCR_HPD0_SHIFT 41 +#define TCR_HPD0 (UL(1) << TCR_HPD0_SHIFT) +#define TCR_HPD1_SHIFT 42 +#define TCR_HPD1 (UL(1) << TCR_HPD1_SHIFT) +#define TCR_TBID0 (UL(1) << 51) #define TCR_TBID1 (UL(1) << 52) #define TCR_NFD0 (UL(1) << 53) #define TCR_NFD1 (UL(1) << 54) @@ -284,6 +294,7 @@ #define TCR_E0PD1 (UL(1) << 56) #define TCR_TCMA0 (UL(1) << 57) #define TCR_TCMA1 (UL(1) << 58) +#define TCR_DS (UL(1) << 59) /* * TTBR. diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index b4b2b86237692c853edcab7e7a3abd2d2bfe4529..0e11010219619e3f29f8f77aee52386d87ae155b 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -30,8 +30,8 @@ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) -#define PROT_DEFAULT (_PROT_DEFAULT | PTE_MAYBE_NG) -#define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_MAYBE_NG) +#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) +#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) @@ -71,7 +71,19 @@ extern bool arm64_use_ng_mappings; #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) +#ifndef CONFIG_ARM64_LPA2 #define lpa2_is_enabled() false +#define PTE_MAYBE_SHARED PTE_SHARED +#define PMD_MAYBE_SHARED PMD_SECT_S +#else +static inline bool __pure lpa2_is_enabled(void) +{ + return read_tcr() & TCR_DS; +} + +#define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PTE_SHARED) +#define PMD_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PMD_SECT_S) +#endif /* * If we have userspace only BTI we don't want to mark kernel pages diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 2cbcf623f5317ef820439da361af4acb1446a686..eb5dced33ee08e2fa2eb1e99d97740441d1c5d1e 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -109,6 +109,9 @@ #define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x)) #define set_pstate_dit(x) asm volatile(SET_PSTATE_DIT(x)) +/* Register-based PAN access, for save/restore purposes */ +#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3) + #define __SYS_BARRIER_INSN(CRm, op2, Rt) \ __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f)) @@ -315,7 +318,25 @@ #define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0) #define SYS_PAR_EL1_F BIT(0) +/* When PAR_EL1.F == 1 */ #define SYS_PAR_EL1_FST GENMASK(6, 1) +#define SYS_PAR_EL1_PTW BIT(8) +#define SYS_PAR_EL1_S BIT(9) +#define SYS_PAR_EL1_AssuredOnly BIT(12) +#define SYS_PAR_EL1_TopLevel BIT(13) +#define SYS_PAR_EL1_Overlay BIT(14) +#define SYS_PAR_EL1_DirtyBit BIT(15) +#define SYS_PAR_EL1_F1_IMPDEF GENMASK_ULL(63, 48) +#define SYS_PAR_EL1_F1_RES0 (BIT(7) | BIT(10) | GENMASK_ULL(47, 16)) +#define SYS_PAR_EL1_RES1 BIT(11) +/* When PAR_EL1.F == 0 */ +#define SYS_PAR_EL1_SH GENMASK_ULL(8, 7) +#define SYS_PAR_EL1_NS BIT(9) +#define SYS_PAR_EL1_F0_IMPDEF BIT(10) +#define SYS_PAR_EL1_NSE BIT(11) +#define SYS_PAR_EL1_PA GENMASK_ULL(51, 12) +#define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56) +#define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52)) /*** Statistical Profiling Extension ***/ #define PMSEVFR_EL1_RES0_IMP \ @@ -446,6 +467,7 @@ #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0) #define SYS_CNTPCT_EL0 sys_reg(3, 3, 14, 0, 1) +#define SYS_CNTVCT_EL0 sys_reg(3, 3, 14, 0, 2) #define SYS_CNTPCTSS_EL0 sys_reg(3, 3, 14, 0, 5) #define SYS_CNTVCTSS_EL0 sys_reg(3, 3, 14, 0, 6) @@ -453,14 +475,17 @@ #define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1) #define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2) +#define SYS_CNTV_TVAL_EL0 sys_reg(3, 3, 14, 3, 0) #define SYS_CNTV_CTL_EL0 sys_reg(3, 3, 14, 3, 1) #define SYS_CNTV_CVAL_EL0 sys_reg(3, 3, 14, 3, 2) #define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0) #define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1) #define SYS_AARCH32_CNTPCT sys_reg(0, 0, 0, 14, 0) +#define SYS_AARCH32_CNTVCT sys_reg(0, 1, 0, 14, 0) #define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0) #define SYS_AARCH32_CNTPCTSS sys_reg(0, 8, 0, 14, 0) +#define SYS_AARCH32_CNTVCTSS sys_reg(0, 9, 0, 14, 0) #define __PMEV_op2(n) ((n) & 0x7) #define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3)) @@ -493,6 +518,10 @@ #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) #define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1) #define SYS_SP_EL1 sys_reg(3, 4, 4, 1, 0) +#define SYS_SPSR_irq sys_reg(3, 4, 4, 3, 0) +#define SYS_SPSR_abt sys_reg(3, 4, 4, 3, 1) +#define SYS_SPSR_und sys_reg(3, 4, 4, 3, 2) +#define SYS_SPSR_fiq sys_reg(3, 4, 4, 3, 3) #define SYS_IFSR32_EL2 sys_reg(3, 4, 5, 0, 1) #define SYS_AFSR0_EL2 sys_reg(3, 4, 5, 1, 0) #define SYS_AFSR1_EL2 sys_reg(3, 4, 5, 1, 1) @@ -624,8 +653,26 @@ #define OP_AT_S12E1W sys_insn(AT_Op0, 4, AT_CRn, 8, 5) #define OP_AT_S12E0R sys_insn(AT_Op0, 4, AT_CRn, 8, 6) #define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7) +#define OP_AT_S1E2A sys_insn(AT_Op0, 4, AT_CRn, 9, 2) /* TLBI instructions */ +#define TLBI_Op0 1 + +#define TLBI_Op1_EL1 0 /* Accessible from EL1 or higher */ +#define TLBI_Op1_EL2 4 /* Accessible from EL2 or higher */ + +#define TLBI_CRn_XS 8 /* Extra Slow (the common one) */ +#define TLBI_CRn_nXS 9 /* not Extra Slow (which nobody uses)*/ + +#define TLBI_CRm_IPAIS 0 /* S2 Inner-Shareable */ +#define TLBI_CRm_nROS 1 /* non-Range, Outer-Sharable */ +#define TLBI_CRm_RIS 2 /* Range, Inner-Sharable */ +#define TLBI_CRm_nRIS 3 /* non-Range, Inner-Sharable */ +#define TLBI_CRm_IPAONS 4 /* S2 Outer and Non-Shareable */ +#define TLBI_CRm_ROS 5 /* Range, Outer-Sharable */ +#define TLBI_CRm_RNS 6 /* Range, Non-Sharable */ +#define TLBI_CRm_nRNS 7 /* non-Range, Non-Sharable */ + #define OP_TLBI_VMALLE1OS sys_insn(1, 0, 8, 1, 0) #define OP_TLBI_VAE1OS sys_insn(1, 0, 8, 1, 1) #define OP_TLBI_ASIDE1OS sys_insn(1, 0, 8, 1, 2) @@ -856,10 +903,12 @@ /* id_aa64mmfr0 */ #define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN 0x0 +#define ID_AA64MMFR0_EL1_TGRAN4_LPA2 ID_AA64MMFR0_EL1_TGRAN4_52_BIT #define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX 0x7 #define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN 0x0 #define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX 0x7 #define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN 0x1 +#define ID_AA64MMFR0_EL1_TGRAN16_LPA2 ID_AA64MMFR0_EL1_TGRAN16_52_BIT #define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX 0xf #define ARM64_MIN_PARANGE_BITS 32 @@ -867,6 +916,7 @@ #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN 0x2 +#define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2 0x3 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX 0x7 #ifdef CONFIG_ARM64_PA_BITS_52 @@ -877,11 +927,13 @@ #if defined(CONFIG_ARM64_4K_PAGES) #define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN4_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_LPA2 ID_AA64MMFR0_EL1_TGRAN4_52_BIT #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX #define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN16_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_LPA2 ID_AA64MMFR0_EL1_TGRAN16_52_BIT #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX #define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 261d6e9df2e1009f8efab95d90dda611fcba1b3e..ebf4a9f943ed92049111bd0bc71811c1f33d0222 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -82,6 +82,12 @@ bool is_kvm_arm_initialised(void); DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); +static inline bool is_pkvm_initialized(void) +{ + return IS_ENABLED(CONFIG_KVM) && + static_branch_likely(&kvm_protected_mode_initialized); +} + /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) { @@ -89,8 +95,7 @@ static inline bool is_hyp_mode_available(void) * If KVM protected mode is initialized, all CPUs must have been booted * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1. */ - if (IS_ENABLED(CONFIG_KVM) && - static_branch_likely(&kvm_protected_mode_initialized)) + if (is_pkvm_initialized()) return true; return (__boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 && @@ -104,8 +109,7 @@ static inline bool is_hyp_mode_mismatched(void) * If KVM protected mode is initialized, all CPUs must have been booted * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1. */ - if (IS_ENABLED(CONFIG_KVM) && - static_branch_likely(&kvm_protected_mode_initialized)) + if (is_pkvm_initialized()) return false; return __boot_cpu_mode[0] != __boot_cpu_mode[1]; diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index df2c47c559728b5f57ced959a38de2e09ec37854..9e593bb60975042c935f8b81bdbd5d997a5e367d 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -50,7 +50,6 @@ #define VNCR_VBAR_EL1 0x250 #define VNCR_TCR2_EL1 0x270 #define VNCR_PIRE0_EL1 0x290 -#define VNCR_PIRE0_EL2 0x298 #define VNCR_PIR_EL1 0x2A0 #define VNCR_ICH_LR0_EL2 0x400 #define VNCR_ICH_LR1_EL2 0x408 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 974a6a4cdc020ca5fbf46caf508f482b1ce2e8d0..6558e0cb98c568532520f68dbebfafe474771cce 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -109,6 +109,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ #define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */ +#define KVM_ARM_VCPU_HAS_EL2_E2H0 8 /* Limit NV support to E2H RES0 */ struct kvm_vcpu_init { __u32 target; diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 463b48d0f92500191f4810974a0f3cffb04e26ed..2a37875260c27cfe795068f1851f0c443decdb07 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -803,6 +803,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif + { + .desc = "Broken CNTVOFF_EL2", + .capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF, + ERRATA_MIDR_RANGE_LIST(((const struct midr_range[]) { + MIDR_ALL_VERSIONS(MIDR_QCOM_ORYON_X1), + {} + })), + }, { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8aa173718c89e212f386df46efab8b334fd97d15..2796ce3090d1993bcb547d020bb5d3c3f1385f34 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -231,6 +231,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), @@ -480,6 +481,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -1902,10 +1904,44 @@ static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) {} }; - return !(has_cpuid_feature(entry, scope) || - is_midr_in_range_list(read_cpuid_id(), nv1_ni_list)); + return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && + !(has_cpuid_feature(entry, scope) || + is_midr_in_range_list(read_cpuid_id(), nv1_ni_list))); } +#if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) +static bool has_lpa2_at_stage1(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_LPA2; +} + +static bool has_lpa2_at_stage2(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_2_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2; +} + +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 mmfr0; + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + return has_lpa2_at_stage1(mmfr0) && has_lpa2_at_stage2(mmfr0); +} +#else +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + return false; +} +#endif + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 #define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) @@ -2164,7 +2200,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, if (kvm_get_mode() != KVM_MODE_NV) return false; - if (!has_cpuid_feature(cap, scope)) { + if (!cpucap_multi_entry_cap_matches(cap, scope)) { pr_warn("unavailable: %s\n", cap->desc); return false; } @@ -2540,7 +2576,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NESTED_VIRT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_nested_virt_support, - ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) + .match_list = (const struct arm64_cpu_capabilities []){ + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) + }, + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) + }, + { /* Sentinel */ } + }, }, { .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, @@ -2942,6 +2988,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP) }, + { + .desc = "52-bit Virtual Addressing for KVM (LPA2)", + .capability = ARM64_HAS_LPA2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_lpa2, + }, { .desc = "FPMR", .type = ARM64_CPUCAP_SYSTEM_FEATURE, diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e32c8dd0b17a77a6980072c7064c76b2811ff8ed..e0e710b36da378941e3feb3e7df58de7e0185c7f 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -576,6 +576,21 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) isb 0: mov_q x0, HCR_HOST_NVHE_FLAGS + + /* + * Compliant CPUs advertise their VHE-onlyness with + * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be + * RES1 in that case. Publish the E2H bit early so that + * it can be picked up by the init_el2_state macro. + * + * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but + * don't advertise it (they predate this relaxation). + */ + mrs_s x1, SYS_ID_AA64MMFR4_EL1 + tbz x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f + + orr x0, x0, #HCR_E2H +1: msr hcr_el2, x0 isb @@ -588,22 +603,10 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov_q x1, INIT_SCTLR_EL1_MMU_OFF - /* - * Compliant CPUs advertise their VHE-onlyness with - * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be - * RES1 in that case. - * - * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, but - * don't advertise it (they predate this relaxation). - */ - mrs_s x0, SYS_ID_AA64MMFR4_EL1 - ubfx x0, x0, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH - tbnz x0, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f - mrs x0, hcr_el2 and x0, x0, #HCR_E2H cbz x0, 2f -1: + /* Set a sane SCTLR_EL1, the VHE way */ pre_disable_mmu_workaround msr_s SYS_SCTLR_EL12, x1 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 35f3c795951373549faad3ed2d1bd30ad427eb78..2eb4203f1425e4f34b0bb1f4975b9ac3bbdbc14f 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -73,6 +73,9 @@ KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); KVM_NVHE_ALIAS(gic_nonsecure_priorities); #endif +/* Static key which is set if CNTVOFF_EL2 is unusable */ +KVM_NVHE_ALIAS(broken_cntvoff_key); + /* EL2 exception handling */ KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 45446916c6b3d303309842a9d98f4a0441a62b7c..4e4185ddfa7e63894c460ba830c2b6bd848a4750 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,6 +21,7 @@ if VIRTUALIZATION menuconfig KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + depends on AS_HAS_ARMV8_4 select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 5fd2365591049cd4327526d66bcb4b0f55812768..a4df40d32cd103e5b7031aabe48e69cba779f889 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -13,7 +13,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o pvsched.o \ inject_fault.o handle_exit.o pv_cpufreq.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o \ - arch_timer.o trng.o vmid.o emulate-nested.o nested.o \ + arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ @@ -24,6 +24,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o pvsched.o \ obj-y += kvm_host.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o +kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o ifdef CONFIG_KVM_ARM_HOST_VHE_ONLY ccflags-y += -I $(srctree)/$(src)/hyp/include diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 77edf709d59818471b5c64e5166ae42f3009a7c8..61bc49ab1a713c9a0b3f4c7ee7b18dc3519be4ac 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -30,6 +30,7 @@ static u32 host_vtimer_irq_flags; static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); +DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key); static const u8 default_ppi[] = { [TIMER_PTIMER] = 30, @@ -101,21 +102,6 @@ u64 timer_get_cval(struct arch_timer_context *ctxt) } } -static u64 timer_get_offset(struct arch_timer_context *ctxt) -{ - u64 offset = 0; - - if (!ctxt) - return 0; - - if (ctxt->offset.vm_offset) - offset += *ctxt->offset.vm_offset; - if (ctxt->offset.vcpu_offset) - offset += *ctxt->offset.vcpu_offset; - - return offset; -} - static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) { struct kvm_vcpu *vcpu = ctxt->vcpu; @@ -441,11 +427,30 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } +static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) +{ + /* + * Paper over NV2 brokenness by publishing the interrupt status + * bit. This still results in a poor quality of emulation (guest + * writes will have no effect until the next exit). + * + * But hey, it's fast, right? + */ + if (is_hyp_ctxt(ctx->vcpu) && + (ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) { + unsigned long val = timer_get_ctl(ctx); + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level); + timer_set_ctl(ctx, val); + } +} + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { int ret; + kvm_timer_update_status(timer_ctx, new_level); + timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); @@ -469,6 +474,8 @@ static void timer_emulate(struct arch_timer_context *ctx) if (should_fire != ctx->irq.level) kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); + kvm_timer_update_status(ctx, should_fire); + /* * If the timer can fire now, we don't need to have a soft timer * scheduled for the future. If the timer cannot fire at all, @@ -511,7 +518,12 @@ static void timer_save_state(struct arch_timer_context *ctx) case TIMER_VTIMER: case TIMER_HVTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL)); - timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL)); + cval = read_sysreg_el0(SYS_CNTV_CVAL); + + if (has_broken_cntvoff()) + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTV_CTL); @@ -616,8 +628,15 @@ static void timer_restore_state(struct arch_timer_context *ctx) case TIMER_VTIMER: case TIMER_HVTIMER: - set_cntvoff(timer_get_offset(ctx)); - write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL); + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + if (has_broken_cntvoff()) { + set_cntvoff(0); + cval += offset; + } else { + set_cntvoff(offset); + } + write_sysreg_el0(cval, SYS_CNTV_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); break; @@ -760,7 +779,7 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) { - bool tpt, tpc; + bool tvt, tpt, tvc, tpc, tvt02, tpt02; u64 clr, set; /* @@ -775,7 +794,29 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) * within this function, reality kicks in and we start adding * traps based on emulation requirements. */ - tpt = tpc = false; + tvt = tpt = tvc = tpc = false; + tvt02 = tpt02 = false; + + /* + * NV2 badly breaks the timer semantics by redirecting accesses to + * the EL1 timer state to memory, so let's call ECV to the rescue if + * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses. + * + * The treatment slightly varies depending whether we run a nVHE or + * VHE guest: nVHE will use the _EL0 registers directly, while VHE + * will use the _EL02 accessors. This translates in different trap + * bits. + * + * None of the trapping is required when running in non-HYP context, + * unless required by the L1 hypervisor settings once we advertise + * ECV+NV in the guest, or that we need trapping for other reasons. + */ + if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) { + if (vcpu_el2_e2h_is_set(vcpu)) + tvt02 = tpt02 = true; + else + tvt = tpt = true; + } /* * We have two possibility to deal with a physical offset: @@ -790,10 +831,21 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) if (!has_cntpoff() && timer_get_offset(map->direct_ptimer)) tpt = tpc = true; + /* + * For the poor sods that could not correctly substract one value + * from another, trap the full virtual timer and counter. + */ + if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer)) + tvt = tvc = true; + /* * Apply the enable bits that the guest hypervisor has requested for * its own guest. We can only add traps that wouldn't have been set * above. + * Implementation choices: we do not support NV when E2H=0 in the + * guest, and we don't support configuration where E2H is writable + * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but + * not both). This simplifies the handling of the EL1NV* bits. */ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); @@ -804,6 +856,9 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) tpt |= !(val & (CNTHCTL_EL1PCEN << 10)); tpc |= !(val & (CNTHCTL_EL1PCTEN << 10)); + + tpt02 |= (val & CNTHCTL_EL1NVPCT); + tvt02 |= (val & CNTHCTL_EL1NVVCT); } /* @@ -815,6 +870,10 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr); assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr); + assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set); + assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set); + assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set); + assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set); /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */ sysreg_clear_set(cnthctl_el2, clr, set); @@ -903,6 +962,44 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) kvm_timer_blocking(vcpu); } +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) +{ + /* + * When NV2 is on, guest hypervisors have their EL1 timer register + * accesses redirected to the VNCR page. Any guest action taken on + * the timer is postponed until the next exit, leading to a very + * poor quality of emulation. + * + * This is an unmitigated disaster, only papered over by FEAT_ECV, + * which allows trapping of the timer registers even with NV2. + * Still, this is still worse than FEAT_NV on its own. Meh. + */ + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { + /* + * For a VHE guest hypervisor, the EL2 state is directly + * stored in the host EL1 timers, while the emulated EL1 + * state is stored in the VNCR page. The latter could have + * been updated behind our back, and we must reset the + * emulation of the timers. + * + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap despite being + * notionally direct (we use the EL1 HW, as for VHE), while + * the EL1 registers access memory. + * + * In both cases, process the emulated timers on each guest + * exit. Boo. + */ + struct timer_map map; + get_timer_map(vcpu, &map); + + soft_timer_cancel(&map.emul_vtimer->hrtimer); + soft_timer_cancel(&map.emul_ptimer->hrtimer); + timer_emulate(map.emul_vtimer); + timer_emulate(map.emul_ptimer); + } +} + /* * With a userspace irqchip we have to check if the guest de-asserted the * timer and if so, unmask the timer irq signal on the host interrupt @@ -932,7 +1029,7 @@ void kvm_timer_sync_user(struct kvm_vcpu *vcpu) unmask_vtimer_irq_user(vcpu); } -int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) +void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; @@ -976,8 +1073,6 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) soft_timer_cancel(&map.emul_vtimer->hrtimer); if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); - - return 0; } static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) @@ -1363,6 +1458,37 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return 0; } +static void kvm_timer_handle_errata(void) +{ + u64 mmfr0, mmfr1, mmfr4; + + /* + * CNTVOFF_EL2 is broken on some implementations. For those, we trap + * all virtual timer/counter accesses, requiring FEAT_ECV. + * + * However, a hypervisor supporting nesting is likely to mitigate the + * erratum at L0, and not require other levels to mitigate it (which + * would otherwise be a terrible performance sink due to trap + * amplification). + * + * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0, + * and that NV is likely not to (because of limitations of the + * architecture), only enable the workaround when FEAT_VHE and + * FEAT_E2H0 are both detected. Time will tell if this actually holds. + */ + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1); + if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) && + !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) && + SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) && + (has_vhe() || has_hvhe()) && + cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) { + static_branch_enable(&broken_cntvoff_key); + kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n"); + } +} + int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; @@ -1431,6 +1557,7 @@ int __init kvm_timer_hyp_init(bool has_gic) goto out_free_vtimer_irq; } + kvm_timer_handle_errata(); return 0; out_free_ptimer_irq: diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5f1d4dc497eafc547e75ae9b820a7ce5d25647bb..4882f4dc12fe3c8a26c9ac5fa812567f4fe0c60d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -35,16 +35,19 @@ #include #include #include +#include #include #include #include -#include +#include #include #include #include #include +#include "sys_regs.h" + #ifdef MODULE MODULE_IMPORT_NS(KVM); #endif @@ -73,12 +76,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { - int r; - u64 new_cap; + int r = -EINVAL; if (cap->flags) return -EINVAL; + if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) + return -EINVAL; + switch (cap->cap) { case KVM_CAP_ARM_NISV_TO_USER: r = 0; @@ -87,9 +92,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; case KVM_CAP_ARM_MTE: mutex_lock(&kvm->lock); - if (!system_supports_mte() || kvm->created_vcpus) { - r = -EINVAL; - } else { + if (system_supports_mte() && !kvm->created_vcpus) { r = 0; set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); } @@ -100,25 +103,22 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); break; case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: - new_cap = cap->args[0]; - mutex_lock(&kvm->slots_lock); /* * To keep things simple, allow changing the chunk * size only when no memory slots have been created. */ - if (!kvm_are_all_memslots_empty(kvm)) { - r = -EINVAL; - } else if (new_cap && !kvm_is_block_size_supported(new_cap)) { - r = -EINVAL; - } else { - r = 0; - kvm->arch.mmu.split_page_chunk_size = new_cap; + if (kvm_are_all_memslots_empty(kvm)) { + u64 new_cap = cap->args[0]; + + if (!new_cap || kvm_is_block_size_supported(new_cap)) { + r = 0; + kvm->arch.mmu.split_page_chunk_size = new_cap; + } } mutex_unlock(&kvm->slots_lock); break; default: - r = -EINVAL; break; } @@ -148,6 +148,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_unlock(&kvm->lock); #endif + kvm_init_nested(kvm); + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; @@ -238,9 +240,47 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_arm_teardown_hypercalls(kvm); } +static bool kvm_has_full_ptr_auth(void) +{ + bool apa, gpa, api, gpi, apa3, gpa3; + u64 isar1, isar2, val; + + /* + * Check that: + * + * - both Address and Generic auth are implemented for a given + * algorithm (Q5, IMPDEF or Q3) + * - only a single algorithm is implemented. + */ + if (!system_has_full_ptr_auth()) + return false; + + isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); + isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + + apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1); + val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1); + gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP); + + api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1); + val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1); + gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP); + + apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2); + val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2); + gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP); + + return (apa == gpa && api == gpi && apa3 == gpa3 && + (apa + api + apa3) == 1); +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; + + if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) + return 0; + switch (ext) { case KVM_CAP_IRQCHIP: r = vgic_present; @@ -332,7 +372,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: - r = system_has_full_ptr_auth(); + r = kvm_has_full_ptr_auth(); break; case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: if (kvm) @@ -403,7 +443,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) /* Force users to call KVM_ARM_VCPU_INIT */ vcpu_clear_flag(vcpu, VCPU_INITIALIZED); - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; @@ -461,11 +500,52 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) } +static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) +{ + if (vcpu_has_ptrauth(vcpu)) { + /* + * Either we're running running an L2 guest, and the API/APK + * bits come from L1's HCR_EL2, or API/APK are both set. + */ + if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) { + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + val &= (HCR_API | HCR_APK); + vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK); + vcpu->arch.hcr_el2 |= val; + } else { + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + } + + /* + * Save the host keys if there is any chance for the guest + * to use pauth, as the entry code will reload the guest + * keys in that case. + * Protected mode is the exception to that rule, as the + * entry into the EL2 code eagerly switch back and forth + * between host and hyp keys (and kvm_hyp_ctxt is out of + * reach anyway). + */ + if (is_protected_kvm_enabled()) + return; + + if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) { + struct kvm_cpu_context *ctxt; + ctxt = this_cpu_ptr_hyp_sym_wrapper(kvm_hyp_ctxt); + ptrauth_save_keys(ctxt); + } + } +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_s2_mmu *mmu; int *last_ran; + if (vcpu_has_nv(vcpu)) + kvm_vcpu_load_hw_mmu(vcpu); + mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); @@ -509,8 +589,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) else vcpu_set_wfx_traps(vcpu); - if (vcpu_has_ptrauth(vcpu)) - vcpu_ptrauth_disable(vcpu); + vcpu_set_pauth_traps(vcpu); if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); @@ -527,6 +606,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_timer_vcpu_put(vcpu); kvm_vgic_put(vcpu); kvm_vcpu_pmu_restore_host(vcpu); + if (vcpu_has_nv(vcpu)) + kvm_vcpu_put_hw_mmu(vcpu); kvm_arm_vmid_clear_active(); vcpu_clear_on_unsupported_cpu(vcpu); @@ -634,11 +715,6 @@ unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) } #endif -static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) -{ - return vcpu_get_flag(vcpu, VCPU_INITIALIZED); -} - static void kvm_init_mpidr_data(struct kvm *kvm) { struct kvm_mpidr_data *data = NULL; @@ -728,17 +804,15 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } - if (vcpu_has_nv(vcpu)) { - ret = kvm_init_nv_sysregs(vcpu->kvm); - if (ret) - return ret; - } + ret = kvm_finalize_sys_regs(vcpu); + if (ret) + return ret; /* - * This needs to happen after NV has imposed its own restrictions on - * the feature set + * This needs to happen after any restriction has been applied + * to the feature set. */ - kvm_init_sysreg(vcpu); + kvm_calculate_traps(vcpu); ret = kvm_timer_enable(vcpu); if (ret) @@ -754,14 +828,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } - /* - * Initialize traps for protected VMs. - * NOTE: Move to run in EL2 directly, rather than via a hypercall, once - * the code is in place for first run initialization at EL2. - */ - if (kvm_vm_is_protected(kvm)) - kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); - mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); @@ -837,9 +903,8 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) * doorbells to be signalled, should an interrupt become pending. */ preempt_disable(); - kvm_vgic_vmcr_sync(vcpu); vcpu_set_flag(vcpu, IN_WFI); - vgic_v4_put(vcpu); + kvm_vgic_put(vcpu); preempt_enable(); kvm_vcpu_halt(vcpu); @@ -847,7 +912,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) preempt_disable(); vcpu_clear_flag(vcpu, IN_WFI); - vgic_v4_load(vcpu); + kvm_vgic_load(vcpu); preempt_enable(); } @@ -933,6 +998,8 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_dirty_ring_check_request(vcpu)) return 0; + + check_nested_vcpu_requests(vcpu); } return 1; @@ -1133,6 +1200,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); + if (is_hyp_ctxt(vcpu)) + kvm_timer_sync_nested(vcpu); + kvm_arch_vcpu_ctxsync_fp(vcpu); /* @@ -1295,6 +1365,30 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, return -EINVAL; } +static unsigned long system_supported_vcpu_features(void) +{ + unsigned long features = KVM_VCPU_VALID_FEATURES; + + if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1)) + clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features); + + if (!kvm_arm_support_pmu_v3()) + clear_bit(KVM_ARM_VCPU_PMU_V3, &features); + + if (!system_supports_sve()) + clear_bit(KVM_ARM_VCPU_SVE, &features); + + if (!kvm_has_full_ptr_auth()) { + clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features); + clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features); + } + + if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT)) + clear_bit(KVM_ARM_VCPU_HAS_EL2, &features); + + return features; +} + static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { @@ -1309,12 +1403,25 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, return -ENOENT; } - if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) - return 0; + if (features & ~system_supported_vcpu_features()) + return -EINVAL; + + /* + * For now make sure that both address/generic pointer authentication + * features are requested by the userspace together. + */ + if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) != + test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features)) + return -EINVAL; - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) + /* Disallow NV+SVE for the time being */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features) && + test_bit(KVM_ARM_VCPU_SVE, &features)) return -EINVAL; + if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) + return 0; + /* MTE is incompatible with AArch32 */ if (kvm_has_mte(vcpu->kvm)) return -EINVAL; @@ -1331,7 +1438,8 @@ static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu, { unsigned long features = init->features[0]; - return !bitmap_equal(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); + return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features, + KVM_VCPU_MAX_FEATURES); } static int kvm_setup_vcpu(struct kvm_vcpu *vcpu) @@ -1346,6 +1454,10 @@ static int kvm_setup_vcpu(struct kvm_vcpu *vcpu) if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu) ret = kvm_arm_set_default_pmu(kvm); + /* Prepare for nested if required */ + if (!ret && vcpu_has_nv(vcpu)) + ret = kvm_vcpu_init_nested(vcpu); + return ret; } @@ -1359,25 +1471,21 @@ static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, mutex_lock(&kvm->arch.config_lock); if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) && - !bitmap_equal(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES)) + kvm_vcpu_init_changed(vcpu, init)) goto out_unlock; - bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); + bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES); ret = kvm_setup_vcpu(vcpu); if (ret) goto out_unlock; /* Now we know what it is, we can reset it. */ - ret = kvm_reset_vcpu(vcpu); - if (ret) { - bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); - goto out_unlock; - } + kvm_reset_vcpu(vcpu); - bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES); set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags); vcpu_set_flag(vcpu, VCPU_INITIALIZED); + ret = 0; out_unlock: mutex_unlock(&kvm->arch.config_lock); return ret; @@ -1402,7 +1510,8 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, if (kvm_vcpu_init_changed(vcpu, init)) return -EINVAL; - return kvm_reset_vcpu(vcpu); + kvm_reset_vcpu(vcpu); + return 0; } static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, @@ -1881,6 +1990,12 @@ static unsigned long nvhe_percpu_order(void) return size ? get_order(size) : 0; } + +static size_t pkvm_host_sve_state_order(void) +{ + return get_order(pkvm_host_sve_state_size()); +} + #endif /* A lookup table holding the hypervisor VA for each vector slot */ @@ -1937,6 +2052,7 @@ static void cpu_hyp_init_context(void) static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); + u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); unsigned long tcr; /* @@ -1959,6 +2075,10 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) } tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); + tcr &= ~TCR_EL2_PS_MASK; + tcr |= FIELD_PREP(TCR_EL2_PS_MASK, kvm_get_parange(mmfr0)); + if (kvm_lpa2_is_enabled()) + tcr |= TCR_EL2_DS; params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); @@ -2295,14 +2415,30 @@ static int __init init_hyp_mode(void) kvm_err("init hyp mode is not allowed\n"); return -EPERM; } + +static void finalize_init_hyp_mode(void) +{ +} #else static void __init teardown_hyp_mode(void) { + bool free_sve = system_supports_sve() && is_protected_kvm_enabled(); int cpu; free_hyp_pgds(); for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + + if (!kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]) + continue; + + if (free_sve) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + free_pages((unsigned long) sve_state, pkvm_host_sve_state_order()); + } + free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); } } @@ -2388,6 +2524,50 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits) return 0; } +static int init_pkvm_host_sve_state(void) +{ + int cpu; + + if (!system_supports_sve()) + return 0; + + /* Allocate pages for host sve state in protected mode. */ + for_each_possible_cpu(cpu) { + struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order()); + + if (!page) + return -ENOMEM; + + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page); + } + + /* + * Don't map the pages in hyp since these are only used in protected + * mode, which will (re)create its own mapping when initialized. + */ + + return 0; +} + +/* + * Finalizes the initialization of hyp mode, once everything else is initialized + * and the initialziation process cannot fail. + */ +static void finalize_init_hyp_mode(void) +{ + int cpu; + + if (system_supports_sve() && is_protected_kvm_enabled()) { + for_each_possible_cpu(cpu) { + struct cpu_sve_state *sve_state; + + sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; + per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = + kern_hyp_va(sve_state); + } + } +} + static void pkvm_hyp_init_ptrauth(void) { struct kvm_cpu_context *hyp_ctxt; @@ -2556,6 +2736,10 @@ static int __init init_hyp_mode(void) goto out_err; } + err = init_pkvm_host_sve_state(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); @@ -2718,13 +2902,12 @@ static __init int kvm_arm_init(void) if (err) goto out_hyp; - if (is_protected_kvm_enabled()) { - kvm_info("Protected nVHE mode initialized successfully\n"); - } else if (in_hyp_mode) { - kvm_info("VHE mode initialized successfully\n"); - } else { - kvm_info("Hyp mode initialized successfully\n"); - } + kvm_info("%s%sVHE%s mode initialized successfully\n", + in_hyp_mode ? "" : (is_protected_kvm_enabled() ? + "Protected " : "Hyp "), + in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ? + "h" : "n"), + cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": ""); /* * FIXME: Do something reasonable if kvm_init() fails after pKVM @@ -2734,6 +2917,13 @@ static __init int kvm_arm_init(void) if (err) goto out_subs; + /* + * This should be called after initialization is done and failure isn't + * possible anymore. + */ + if (!in_hyp_mode) + finalize_init_hyp_mode(); + kvm_arm_initialised = true; return 0; diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c new file mode 100644 index 0000000000000000000000000000000000000000..a751f6bc49bf4fea2bd89978b9d881f952b64483 --- /dev/null +++ b/arch/arm64/kvm/at.c @@ -0,0 +1,1301 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Linaro Ltd + * Author: Jintack Lim + */ + +#include + +#include +#include +#include + +enum trans_regime { + TR_EL10, + TR_EL20, + TR_EL2, +}; + +struct s1_walk_info { + u64 baddr; + enum trans_regime regime; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int txsz; + int sl; + bool hpd; + bool pan; + bool be; + bool s2; +}; + +struct s1_walk_result { + union { + struct { + u64 desc; + u64 pa; + s8 level; + u8 APTable; + bool UXNTable; + bool PXNTable; + bool ur; + bool uw; + bool ux; + bool pr; + bool pw; + bool px; + }; + struct { + u8 fst; + bool ptw; + bool s2; + }; + }; + bool failed; +}; + +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +{ + wr->fst = fst; + wr->ptw = ptw; + wr->s2 = s2; + wr->failed = true; +} + +#define S1_MMU_DISABLED (-127) + +static int get_ia_size(struct s1_walk_info *wi) +{ + return 64 - wi->txsz; +} + +/* Return true if the IPA is out of the OA range */ +static bool check_output_size(u64 ipa, struct s1_walk_info *wi) +{ + return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); +} + +/* Return the translation regime that applies to an AT instruction */ +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) +{ + /* + * We only get here from guest EL2, so the translation + * regime AT applies to is solely defined by {E2H,TGE}. + */ + switch (op) { + case OP_AT_S1E2R: + case OP_AT_S1E2W: + case OP_AT_S1E2A: + return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + break; + default: + return (vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; + } +} + +static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + return false; + + switch (regime) { + case TR_EL2: + case TR_EL20: + return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; + case TR_EL10: + return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && + (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1x_PIE); + default: + BUG(); + } +} + +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + unsigned int stride, x; + bool va55, tbi, lva, as_el0; + + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + + wi->regime = compute_translation_regime(vcpu, op); + as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + + va55 = va & BIT(55); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + + switch (wi->regime) { + case TR_EL10: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL2: + case TR_EL20: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + tbi = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_TBI, tcr) : + (va55 ? + FIELD_GET(TCR_TBI1, tcr) : + FIELD_GET(TCR_TBI0, tcr))); + + if (!tbi && (u64)sign_extend64(va, 55) != va) + goto addrsz; + + va = (u64)sign_extend64(va, 55); + + /* Let's put the MMU disabled case aside immediately */ + switch (wi->regime) { + case TR_EL10: + /* + * If dealing with the EL1&0 translation regime, 3 things + * can disable the S1 translation: + * + * - HCR_EL2.DC = 1 + * - HCR_EL2.{E2H,TGE} = {0,1} + * - SCTLR_EL1.M = 0 + * + * The TGE part is interesting. If we have decided that this + * is EL1&0, then it means that either {E2H,TGE} == {1,0} or + * {0,x}, and we only need to test for TGE == 1. + */ + if (hcr & (HCR_DC | HCR_TGE)) { + wr->level = S1_MMU_DISABLED; + break; + } + fallthrough; + case TR_EL2: + case TR_EL20: + if (!(sctlr & SCTLR_ELx_M)) + wr->level = S1_MMU_DISABLED; + break; + } + + if (wr->level == S1_MMU_DISABLED) { + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) + goto addrsz; + + wr->pa = va; + return 0; + } + + wi->be = sctlr & SCTLR_ELx_EE; + + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); + wi->hpd &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HPD, tcr) : + (va55 ? + FIELD_GET(TCR_HPD1, tcr) : + FIELD_GET(TCR_HPD0, tcr))); + /* R_JHSVW */ + wi->hpd |= s1pie_enabled(vcpu, wi->regime); + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + /* R_PLCGL, R_YXNYW */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { + if (wi->txsz > 39) + goto transfault_l0; + } else { + if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) + goto transfault_l0; + } + + /* R_GTJBY, R_SXWGM */ + switch (BIT(wi->pgshift)) { + case SZ_4K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_16K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_64K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); + break; + } + + if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) + goto transfault_l0; + + ia_bits = get_ia_size(wi); + + /* R_YYVYV, I_THCZK */ + if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || + (va55 && va < GENMASK(63, ia_bits))) + goto transfault_l0; + + /* I_ZFSYQ */ + if (wi->regime != TR_EL2 && + (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) + goto transfault_l0; + + /* R_BNDVG and following statements */ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && + as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault_l0; + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + ps = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); + + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + + /* Compute minimal alignment */ + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); + + wi->baddr = ttbr & TTBRx_EL1_BADDR; + + /* R_VPBBF */ + if (check_output_size(wi->baddr, wi)) + goto addrsz; + + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + + return 0; + +addrsz: /* Address Size Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); + return -EFAULT; + +transfault_l0: /* Translation Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); + return -EFAULT; +} + +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 va_top, va_bottom, baddr, desc; + int level, stride, ret; + + level = wi->sl; + stride = wi->pgshift - 3; + baddr = wi->baddr; + + va_top = get_ia_size(wi) - 1; + + while (1) { + u64 index, ipa; + + va_bottom = (3 - level) * stride + wi->pgshift; + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); + + ipa = baddr | index; + + if (wi->s2) { + struct kvm_s2_trans s2_trans = {}; + + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret) { + fail_s1_walk(wr, + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, + true, true); + return ret; + } + + if (!kvm_s2_trans_readable(&s2_trans)) { + fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), + true, true); + + return -EPERM; + } + + ipa = kvm_s2_trans_output(&s2_trans); + } + + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), + true, false); + return ret; + } + + if (wi->be) + desc = be64_to_cpu((__force __be64)desc); + else + desc = le64_to_cpu((__force __le64)desc); + + /* Invalid descriptor */ + if (!(desc & BIT(0))) + goto transfault; + + /* Block mapping, check validity down the line */ + if (!(desc & BIT(1))) + break; + + /* Page mapping */ + if (level == 3) + break; + + /* Table handling */ + if (!wi->hpd) { + wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); + } + + baddr = desc & GENMASK_ULL(47, wi->pgshift); + + /* Check for out-of-range OA */ + if (check_output_size(baddr, wi)) + goto addrsz; + + /* Prepare for next round */ + va_top = va_bottom - 1; + level++; + } + + /* Block mapping, check the validity of the level */ + if (!(desc & BIT(1))) { + bool valid_block = false; + + switch (BIT(wi->pgshift)) { + case SZ_4K: + valid_block = level == 1 || level == 2; + break; + case SZ_16K: + case SZ_64K: + valid_block = level == 2; + break; + } + + if (!valid_block) + goto transfault; + } + + if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + goto addrsz; + + va_bottom += contiguous_bit_shift(desc, wi, level); + + wr->failed = false; + wr->level = level; + wr->desc = desc; + wr->pa = desc & GENMASK(47, va_bottom); + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + + return 0; + +addrsz: + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + return -EINVAL; +transfault: + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + return -ENOENT; +} + +struct mmu_config { + u64 ttbr0; + u64 ttbr1; + u64 tcr; + u64 mair; + u64 tcr2; + u64 pir; + u64 pire0; + u64 sctlr; + u64 vttbr; + u64 vtcr; + u64 hcr; +}; + +static void __mmu_config_save(struct mmu_config *config) +{ + config->ttbr0 = read_sysreg_el1(SYS_TTBR0); + config->ttbr1 = read_sysreg_el1(SYS_TTBR1); + config->tcr = read_sysreg_el1(SYS_TCR); + config->mair = read_sysreg_el1(SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + config->tcr2 = read_sysreg_el1(SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + config->pir = read_sysreg_el1(SYS_PIR); + config->pire0 = read_sysreg_el1(SYS_PIRE0); + } + } + config->sctlr = read_sysreg_el1(SYS_SCTLR); + config->vttbr = read_sysreg(vttbr_el2); + config->vtcr = read_sysreg(vtcr_el2); + config->hcr = read_sysreg(hcr_el2); +} + +static void __mmu_config_restore(struct mmu_config *config) +{ + write_sysreg(config->hcr, hcr_el2); + + /* + * ARM errata 1165522 and 1530923 require TGE to be 1 before + * we update the guest state. + */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + write_sysreg_el1(config->ttbr0, SYS_TTBR0); + write_sysreg_el1(config->ttbr1, SYS_TTBR1); + write_sysreg_el1(config->tcr, SYS_TCR); + write_sysreg_el1(config->mair, SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + write_sysreg_el1(config->tcr2, SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + write_sysreg_el1(config->pir, SYS_PIR); + write_sysreg_el1(config->pire0, SYS_PIRE0); + } + } + write_sysreg_el1(config->sctlr, SYS_SCTLR); + write_sysreg(config->vttbr, vttbr_el2); + write_sysreg(config->vtcr, vtcr_el2); +} + +static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 host_pan; + bool fail; + + host_pan = read_sysreg_s(SYS_PSTATE_PAN); + write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); + + switch (op) { + case OP_AT_S1E1RP: + fail = __kvm_at(OP_AT_S1E1RP, vaddr); + break; + case OP_AT_S1E1WP: + fail = __kvm_at(OP_AT_S1E1WP, vaddr); + break; + } + + write_sysreg_s(host_pan, SYS_PSTATE_PAN); + + return fail; +} + +#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) +#define MEMATTR_NC 0b0100 +#define MEMATTR_Wt 0b1000 +#define MEMATTR_Wb 0b1100 +#define MEMATTR_WbRaWa 0b1111 + +#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) + +static u8 s2_memattr_to_attr(u8 memattr) +{ + memattr &= 0b1111; + + switch (memattr) { + case 0b0000: + case 0b0001: + case 0b0010: + case 0b0011: + return memattr << 2; + case 0b0100: + return MEMATTR(Wb, Wb); + case 0b0101: + return MEMATTR(NC, NC); + case 0b0110: + return MEMATTR(Wt, NC); + case 0b0111: + return MEMATTR(Wb, NC); + case 0b1000: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1001: + return MEMATTR(NC, Wt); + case 0b1010: + return MEMATTR(Wt, Wt); + case 0b1011: + return MEMATTR(Wb, Wt); + case 0b1100: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1101: + return MEMATTR(NC, Wb); + case 0b1110: + return MEMATTR(Wt, Wb); + case 0b1111: + return MEMATTR(Wb, Wb); + default: + unreachable(); + } +} + +static u8 combine_s1_s2_attr(u8 s1, u8 s2) +{ + bool transient; + u8 final = 0; + + /* Upgrade transient s1 to non-transient to simplify things */ + switch (s1) { + case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ + transient = true; + s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); + break; + case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */ + transient = true; + s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); + break; + default: + transient = false; + } + + /* S2CombineS1AttrHints() */ + if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || + (s2 & GENMASK(3, 2)) == MEMATTR_NC) + final = MEMATTR_NC; + else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || + (s2 & GENMASK(3, 2)) == MEMATTR_Wt) + final = MEMATTR_Wt; + else + final = MEMATTR_Wb; + + if (final != MEMATTR_NC) { + /* Inherit RaWa hints form S1 */ + if (transient) { + switch (s1 & GENMASK(3, 2)) { + case MEMATTR_Wt: + final = 0; + break; + case MEMATTR_Wb: + final = MEMATTR_NC; + break; + } + } + + final |= s1 & GENMASK(1, 0); + } + + return final; +} + +#define ATTR_NSH 0b00 +#define ATTR_RSV 0b01 +#define ATTR_OSH 0b10 +#define ATTR_ISH 0b11 + +static u8 compute_sh(u8 attr, u64 desc) +{ + u8 sh; + + /* Any form of device, as well as NC has SH[1:0]=0b10 */ + if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) + return ATTR_OSH; + + sh = FIELD_GET(PTE_SHARED, desc); + if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ + sh = ATTR_NSH; + + return sh; +} + +static u8 combine_sh(u8 s1_sh, u8 s2_sh) +{ + if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) + return ATTR_OSH; + if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) + return ATTR_ISH; + + return ATTR_NSH; +} + +static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, + struct kvm_s2_trans *tr) +{ + u8 s1_parattr, s2_memattr, final_attr; + u64 par; + + /* If S2 has failed to translate, report the damage */ + if (tr->esr) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= SYS_PAR_EL1_S; + par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); + return par; + } + + s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); + s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) + s2_memattr &= ~BIT(3); + + /* Combination of R_VRJSW and R_RHWZM */ + switch (s2_memattr) { + case 0b0101: + if (MEMATTR_IS_DEVICE(s1_parattr)) + final_attr = s1_parattr; + else + final_attr = MEMATTR(NC, NC); + break; + case 0b0110: + case 0b1110: + final_attr = MEMATTR(WbRaWa, WbRaWa); + break; + case 0b0111: + case 0b1111: + /* Preserve S1 attribute */ + final_attr = s1_parattr; + break; + case 0b0100: + case 0b1100: + case 0b1101: + /* Reserved, do something non-silly */ + final_attr = s1_parattr; + break; + default: + /* + * MemAttr[2]=0, Device from S2. + * + * FWB does not influence the way that stage 1 + * memory types and attributes are combined + * with stage 2 Device type and attributes. + */ + final_attr = min(s2_memattr_to_attr(s2_memattr), + s1_parattr); + } + } else { + /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ + u8 s2_parattr = s2_memattr_to_attr(s2_memattr); + + if (MEMATTR_IS_DEVICE(s1_parattr) || + MEMATTR_IS_DEVICE(s2_parattr)) { + final_attr = min(s1_parattr, s2_parattr); + } else { + /* At this stage, this is memory vs memory */ + final_attr = combine_s1_s2_attr(s1_parattr & 0xf, + s2_parattr & 0xf); + final_attr |= combine_s1_s2_attr(s1_parattr >> 4, + s2_parattr >> 4) << 4; + } + } + + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && + !MEMATTR_IS_DEVICE(final_attr)) + final_attr = MEMATTR(NC, NC); + + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); + par |= tr->output & GENMASK(47, 12); + par |= FIELD_PREP(SYS_PAR_EL1_SH, + combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), + compute_sh(final_attr, tr->desc))); + + return par; +} + +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, + enum trans_regime regime) +{ + u64 par; + + if (wr->failed) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); + par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; + par |= wr->s2 ? SYS_PAR_EL1_S : 0; + } else if (wr->level == S1_MMU_DISABLED) { + /* MMU off or HCR_EL2.DC == 1 */ + par = SYS_PAR_EL1_NSE; + par |= wr->pa & GENMASK_ULL(47, 12); + + if (regime == TR_EL10 && + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, + MEMATTR(WbRaWa, WbRaWa)); + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); + } else { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); + } + } else { + u64 mair, sctlr; + u8 sh; + + par = SYS_PAR_EL1_NSE; + + mair = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, MAIR_EL1) : + vcpu_read_sys_reg(vcpu, MAIR_EL2)); + + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; + mair &= 0xff; + + sctlr = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, SCTLR_EL1) : + vcpu_read_sys_reg(vcpu, SCTLR_EL2)); + + /* Force NC for memory if SCTLR_ELx.C is clear */ + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) + mair = MEMATTR(NC, NC); + + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); + par |= wr->pa & GENMASK_ULL(47, 12); + + sh = compute_sh(mair, wr->desc); + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); + } + + return par; +} + +static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + u64 sctlr; + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + return false; + + if (s1pie_enabled(vcpu, regime)) + return true; + + if (regime == TR_EL10) + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + else + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + + return sctlr & SCTLR_EL1_EPAN; +} + +static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { + case 0b00: + wr->pr = wr->pw = true; + wr->ur = wr->uw = false; + break; + case 0b01: + wr->pr = wr->pw = wr->ur = wr->uw = true; + break; + case 0b10: + wr->pr = true; + wr->pw = wr->ur = wr->uw = false; + break; + case 0b11: + wr->pr = wr->ur = true; + wr->pw = wr->uw = false; + break; + } + + /* We don't use px for anything yet, but hey... */ + wr->px = !((wr->desc & PTE_PXN) || wr->uw); + wr->ux = !(wr->desc & PTE_UXN); + } else { + wr->ur = wr->uw = wr->ux = false; + + if (!(wr->desc & PTE_RDONLY)) { + wr->pr = wr->pw = true; + } else { + wr->pr = true; + wr->pw = false; + } + + /* XN maps to UXN */ + wr->px = !(wr->desc & PTE_UXN); + } +} + +static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + /* Hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (wr->APTable) { + case 0b00: + break; + case 0b01: + wr->ur = wr->uw = false; + break; + case 0b10: + wr->pw = wr->uw = false; + break; + case 0b11: + wr->pw = wr->ur = wr->uw = false; + break; + } + + wr->px &= !wr->PXNTable; + wr->ux &= !wr->UXNTable; + } else { + if (wr->APTable & BIT(1)) + wr->pw = false; + + /* XN maps to UXN */ + wr->px &= !wr->UXNTable; + } +} + +#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf) + +#define set_priv_perms(wr, r, w, x) \ + do { \ + (wr)->pr = (r); \ + (wr)->pw = (w); \ + (wr)->px = (x); \ + } while (0) + +#define set_unpriv_perms(wr, r, w, x) \ + do { \ + (wr)->ur = (r); \ + (wr)->uw = (w); \ + (wr)->ux = (x); \ + } while (0) + +/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ +#define set_perms(w, wr, ip) \ + do { \ + /* R_LLZDZ */ \ + switch ((ip)) { \ + case 0b0000: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b0010: \ + set_ ## w ## _perms((wr), false, false, true ); \ + break; \ + case 0b0011: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b0100: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0101: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b0110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b0111: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1000: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1010: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b1011: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1100: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b1101: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1111: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + } \ + } while (0) + +static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 up, pp, idx; + + idx = pte_pi_index(wr->desc); + + switch (wi->regime) { + case TR_EL10: + pp = perm_idx(vcpu, PIR_EL1, idx); + up = perm_idx(vcpu, PIRE0_EL1, idx); + break; + case TR_EL20: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = perm_idx(vcpu, PIRE0_EL2, idx); + break; + case TR_EL2: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = 0; + break; + } + + set_perms(priv, wr, pp); + + if (wi->regime != TR_EL2) + set_perms(unpriv, wr, up); + else + set_unpriv_perms(wr, false, false, false); + + /* R_VFPJF */ + if (wr->px && wr->uw) { + set_priv_perms(wr, false, false, false); + set_unpriv_perms(wr, false, false, false); + } +} + +static void compute_s1_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + bool pan; + + if (!s1pie_enabled(vcpu, wi->regime)) + compute_s1_direct_permissions(vcpu, wi, wr); + else + compute_s1_indirect_permissions(vcpu, wi, wr); + + if (!wi->hpd) + compute_s1_hierarchical_permissions(vcpu, wi, wr); + + pan = wi->pan && (wr->ur || wr->uw || + (pan3_enabled(vcpu, wi->regime) && wr->ux)); + wr->pw &= !pan; + wr->pr &= !pan; +} + +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + bool perm_fail = false; + int ret, idx; + + ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (ret) + goto compute_par; + + compute_s1_permissions(vcpu, &wi, &wr); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1R: + case OP_AT_S1E2R: + perm_fail = !wr.pr; + break; + case OP_AT_S1E1WP: + case OP_AT_S1E1W: + case OP_AT_S1E2W: + perm_fail = !wr.pw; + break; + case OP_AT_S1E0R: + perm_fail = !wr.ur; + break; + case OP_AT_S1E0W: + perm_fail = !wr.uw; + break; + case OP_AT_S1E1A: + case OP_AT_S1E2A: + break; + default: + BUG(); + } + + if (perm_fail) + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + +compute_par: + return compute_par_s1(vcpu, &wr, wi.regime); +} + +/* + * Return the PAR_EL1 value as the result of a valid translation. + * + * If the translation is unsuccessful, the value may only contain + * PAR_EL1.F, and cannot be taken at face value. It isn't an + * indication of the translation having failed, only that the fast + * path did not succeed, *unless* it indicates a S1 permission fault. + */ +static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct mmu_config config; + struct kvm_s2_mmu *mmu; + bool fail; + u64 par; + + par = SYS_PAR_EL1_F; + + /* + * We've trapped, so everything is live on the CPU. As we will + * be switching contexts behind everybody's back, disable + * interrupts while holding the mmu lock. + */ + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); + + /* + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already + * the right one (as we trapped from vEL2). If not, save the + * full MMU context. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + goto skip_mmu_switch; + + /* + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not + * find it (recycled by another vcpu, for example). When this + * happens, admit defeat immediately and use the SW (slow) path. + */ + mmu = lookup_s2_mmu(vcpu); + if (!mmu) + return par; + + __mmu_config_save(&config); + + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); + } + } + write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); + __load_stage2(mmu, mmu->arch); + +skip_mmu_switch: + /* Clear TGE, enable S2 translation, we're rolling */ + write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + isb(); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1WP: + fail = at_s1e1p_fast(vcpu, op, vaddr); + break; + case OP_AT_S1E1R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E1W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E0R: + fail = __kvm_at(OP_AT_S1E0R, vaddr); + break; + case OP_AT_S1E0W: + fail = __kvm_at(OP_AT_S1E0W, vaddr); + break; + case OP_AT_S1E1A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + break; + } + + if (!fail) + par = read_sysreg_par(); + + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + __mmu_config_restore(&config); + + return par; +} + +static bool par_check_s1_perm_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && + !(par & SYS_PAR_EL1_S)); +} + +void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + + /* + * If PAR_EL1 reports that AT failed on a S1 permission fault, we + * know for sure that the PTW was able to walk the S1 tables and + * there's nothing else to do. + * + * If AT failed for any other reason, then we must walk the guest S1 + * to emulate the instruction. + */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par; + + /* + * We've trapped, so everything is live on the CPU. As we will be + * switching context behind everybody's back, disable interrupts... + */ + scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { + struct kvm_s2_mmu *mmu; + u64 val, hcr; + bool fail; + + mmu = &vcpu->kvm->arch.mmu; + + val = hcr = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + val |= HCR_VM; + + if (!vcpu_el2_e2h_is_set(vcpu)) + val |= HCR_NV | HCR_NV1; + + write_sysreg(val, hcr_el2); + isb(); + + par = SYS_PAR_EL1_F; + + switch (op) { + case OP_AT_S1E2R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E2W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E2A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + } + + isb(); + + if (!fail) + par = read_sysreg_par(); + + write_sysreg(hcr, hcr_el2); + isb(); + } + + /* We failed the translation, let's replay it in slow motion */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct kvm_s2_trans out = {}; + u64 ipa, par; + bool write; + int ret; + + /* Do the stage-1 translation */ + switch (op) { + case OP_AT_S12E1R: + op = OP_AT_S1E1R; + write = false; + break; + case OP_AT_S12E1W: + op = OP_AT_S1E1W; + write = true; + break; + case OP_AT_S12E0R: + op = OP_AT_S1E0R; + write = false; + break; + case OP_AT_S12E0W: + op = OP_AT_S1E0W; + write = true; + break; + default: + WARN_ON_ONCE(1); + return; + } + + __kvm_at_s1e01(vcpu, op, vaddr); + par = vcpu_read_sys_reg(vcpu, PAR_EL1); + if (par & SYS_PAR_EL1_F) + return; + + /* + * If we only have a single stage of translation (EL2&0), exit + * early. Same thing if {VM,DC}=={0,0}. + */ + if (compute_translation_regime(vcpu, op) == TR_EL20 || + !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) + return; + + /* Do the stage-2 translation */ + ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); + out.esr = 0; + ret = kvm_walk_nested_s2(vcpu, ipa, &out); + if (ret < 0) + return; + + /* Check the access permission */ + if (!out.esr && + ((!write && !out.readable) || (write && !out.writable))) + out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); + + par = compute_par_s12(vcpu, par, &out); + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 838b401c422059915f98fea02ff40a140b7dd2c3..f517027ab6566535391a0593643c6c31c012652d 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -16,9 +16,13 @@ enum trap_behaviour { BEHAVE_HANDLE_LOCALLY = 0, + BEHAVE_FORWARD_READ = BIT(0), BEHAVE_FORWARD_WRITE = BIT(1), - BEHAVE_FORWARD_ANY = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + + /* Traps that take effect in Host EL0, this is rare! */ + BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2), }; struct trap_bits { @@ -79,25 +83,43 @@ enum cgt_group_id { CGT_MDCR_E2TB, CGT_MDCR_TDCC, + CGT_CPACR_E0POE, + CGT_CPTR_TAM, + CGT_CPTR_TCPAC, + + CGT_HCRX_TCR2En, + + CGT_CNTHCTL_EL1TVT, + CGT_CNTHCTL_EL1TVCT, + + CGT_ICH_HCR_TC, + CGT_ICH_HCR_TALL0, + CGT_ICH_HCR_TALL1, + CGT_ICH_HCR_TDIR, + /* * Anything after this point is a combination of coarse trap * controls, which must all be evaluated to decide what to do. */ __MULTIPLE_CONTROL_BITS__, - CGT_HCR_IMO_FMO = __MULTIPLE_CONTROL_BITS__, + CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, CGT_HCR_TID2_TID4, CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB_TTLBOS, CGT_HCR_TVM_TRVM, + CGT_HCR_TVM_TRVM_HCRX_TCR2En, CGT_HCR_TPU_TICAB, CGT_HCR_TPU_TOCU, CGT_HCR_NV1_nNV2_ENSCXT, CGT_MDCR_TPM_TPMCR, + CGT_MDCR_TPM_HPMN, CGT_MDCR_TDE_TDA, CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE_TDRA, CGT_MDCR_TDCC_TDE_TDA, + CGT_ICH_HCR_TC_TDIR, + /* * Anything after this point requires a callback evaluating a * complex trap condition. Ugly stuff. @@ -105,6 +127,11 @@ enum cgt_group_id { __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PTEN, + CGT_CNTHCTL_EL1NVPCT, + CGT_CNTHCTL_EL1NVVCT, + + CGT_CPTR_TTA, + CGT_MDCR_HPMN, /* Must be last */ __NR_CGT_GROUP_IDS__ @@ -121,7 +148,7 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TID2, .mask = HCR_TID2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID3] = { .index = HCR_EL2, @@ -145,37 +172,37 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TIDCP, .mask = HCR_TIDCP, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TACR] = { .index = HCR_EL2, .value = HCR_TACR, .mask = HCR_TACR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TSW] = { .index = HCR_EL2, .value = HCR_TSW, .mask = HCR_TSW, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */ .index = HCR_EL2, .value = HCR_TPC, .mask = HCR_TPC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPU] = { .index = HCR_EL2, .value = HCR_TPU, .mask = HCR_TPU, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLB] = { .index = HCR_EL2, .value = HCR_TTLB, .mask = HCR_TTLB, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TVM] = { .index = HCR_EL2, @@ -187,7 +214,7 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TDZ, .mask = HCR_TDZ, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TRVM] = { .index = HCR_EL2, @@ -199,151 +226,213 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TLOR, .mask = HCR_TLOR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TERR] = { .index = HCR_EL2, .value = HCR_TERR, .mask = HCR_TERR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_APK] = { .index = HCR_EL2, .value = 0, .mask = HCR_APK, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV_nNV2] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV | HCR_NV2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV1_nNV2] = { .index = HCR_EL2, .value = HCR_NV | HCR_NV1, .mask = HCR_NV | HCR_NV1 | HCR_NV2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_AT] = { .index = HCR_EL2, .value = HCR_AT, .mask = HCR_AT, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_nFIEN] = { .index = HCR_EL2, .value = 0, .mask = HCR_FIEN, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID4] = { .index = HCR_EL2, .value = HCR_TID4, .mask = HCR_TID4, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TICAB] = { .index = HCR_EL2, .value = HCR_TICAB, .mask = HCR_TICAB, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TOCU] = { .index = HCR_EL2, .value = HCR_TOCU, .mask = HCR_TOCU, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_ENSCXT] = { .index = HCR_EL2, .value = 0, .mask = HCR_ENSCXT, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBIS] = { .index = HCR_EL2, .value = HCR_TTLBIS, .mask = HCR_TTLBIS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBOS] = { .index = HCR_EL2, .value = HCR_TTLBOS, .mask = HCR_TTLBOS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMCR] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMCR, .mask = MDCR_EL2_TPMCR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TPM] = { .index = MDCR_EL2, .value = MDCR_EL2_TPM, .mask = MDCR_EL2_TPM, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TDE] = { .index = MDCR_EL2, .value = MDCR_EL2_TDE, .mask = MDCR_EL2_TDE, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDA, .mask = MDCR_EL2_TDA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDOSA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDOSA, .mask = MDCR_EL2_TDOSA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDRA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDRA, .mask = MDCR_EL2_TDRA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2PB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2PB_SHIFT), - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMS] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMS, .mask = MDCR_EL2_TPMS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TTRF] = { .index = MDCR_EL2, .value = MDCR_EL2_TTRF, .mask = MDCR_EL2_TTRF, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2TB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2TB_SHIFT), - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDCC] = { .index = MDCR_EL2, .value = MDCR_EL2_TDCC, .mask = MDCR_EL2_TDCC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_HCRX_TCR2En] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_TCR2En, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPACR_E0POE] = { + .index = CPTR_EL2, + .value = CPACR_ELx_E0POE, + .mask = CPACR_ELx_E0POE, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPTR_TAM] = { + .index = CPTR_EL2, + .value = CPTR_EL2_TAM, + .mask = CPTR_EL2_TAM, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CPTR_TCPAC] = { + .index = CPTR_EL2, + .value = CPTR_EL2_TCPAC, + .mask = CPTR_EL2_TCPAC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVT, + .mask = CNTHCTL_EL1TVT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVCT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVCT, + .mask = CNTHCTL_EL1TVCT, + .behaviour = BEHAVE_FORWARD_READ, + }, + [CGT_ICH_HCR_TC] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TC, + .mask = ICH_HCR_TC, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TALL0] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TALL0, + .mask = ICH_HCR_TALL0, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TALL1] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TALL1, + .mask = ICH_HCR_TALL1, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_ICH_HCR_TDIR] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TDIR, + .mask = ICH_HCR_TDIR, + .behaviour = BEHAVE_FORWARD_RW, }, }; @@ -354,19 +443,24 @@ static const struct trap_bits coarse_trap_bits[] = { } static const enum cgt_group_id *coarse_control_combo[] = { - MCB(CGT_HCR_IMO_FMO, CGT_HCR_IMO, CGT_HCR_FMO), MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM), + MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En, + CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En), MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB), MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU), MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT), MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR), + MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN), MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA), MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), + + MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), + MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), }; typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); @@ -399,7 +493,7 @@ static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu) if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10)) return BEHAVE_HANDLE_LOCALLY; - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) @@ -407,7 +501,74 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10)) return BEHAVE_HANDLE_LOCALLY; - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; +} + +static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu) +{ + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV)); +} + +static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2); + + if (!vcpu_el2_e2h_is_set(vcpu)) + val = translate_cptr_el2_to_cpacr_el1(val); + + if (val & CPACR_ELx_TTA) + return BEHAVE_FORWARD_RW; + + return BEHAVE_HANDLE_LOCALLY; +} + +static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + unsigned int idx; + + + switch (sysreg) { + case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30): + case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30): + idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg); + break; + case SYS_PMXEVTYPER_EL0: + case SYS_PMXEVCNTR_EL0: + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); + break; + default: + /* Someone used this trap helper for something else... */ + KVM_BUG_ON(1, vcpu->kvm); + return BEHAVE_HANDLE_LOCALLY; + } + + if (kvm_pmu_counter_is_hyp(vcpu, idx)) + return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0; + + return BEHAVE_HANDLE_LOCALLY; } #define CCC(id, fn) \ @@ -416,6 +577,10 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) static const complex_condition_check ccc[] = { CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), + CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct), + CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct), + CCC(CGT_CPTR_TTA, check_cptr_tta), + CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), }; /* @@ -487,9 +652,9 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), SR_RANGE_TRAP(SYS_ID_PFR0_EL1, sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), - SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO), + SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), @@ -622,6 +787,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En), SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ), @@ -651,26 +817,96 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_APGAKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APGAKEYHI_EL1, CGT_HCR_APK), /* All _EL2 registers */ - SR_RANGE_TRAP(sys_reg(3, 4, 0, 0, 0), - sys_reg(3, 4, 3, 15, 7), CGT_HCR_NV), + SR_TRAP(SYS_BRBCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VMPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCTLR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ACTLR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCTLR2_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_HCR_EL2, + SYS_HCRX_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SMPRIMAP_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SMCR_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_TTBR0_EL2, + SYS_TCR2_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VTTBR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VTCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VNCR_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_HDFGRTR_EL2, + SYS_HAFGRTR_EL2, CGT_HCR_NV), /* Skip the SP_EL1 encoding... */ SR_TRAP(SYS_SPSR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ELR_EL2, CGT_HCR_NV), - SR_RANGE_TRAP(sys_reg(3, 4, 4, 1, 1), - sys_reg(3, 4, 10, 15, 7), CGT_HCR_NV), - SR_RANGE_TRAP(sys_reg(3, 4, 12, 0, 0), - sys_reg(3, 4, 14, 15, 7), CGT_HCR_NV), - /* All _EL02, _EL12 registers */ + /* Skip SPSR_irq, SPSR_abt, SPSR_und, SPSR_fiq */ + SR_TRAP(SYS_AFSR0_EL2, CGT_HCR_NV), + SR_TRAP(SYS_AFSR1_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ESR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VSESR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_TFSR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_FAR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_HPFAR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_PMSCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MAIR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_AMAIR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAMHCR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAMVPMV_EL2, CGT_HCR_NV), + SR_TRAP(SYS_MPAM2_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_MPAMVPM0_EL2, + SYS_MPAMVPM7_EL2, CGT_HCR_NV), + /* + * Note that the spec. describes a group of MEC registers + * whose access should not trap, therefore skip the following: + * MECID_A0_EL2, MECID_A1_EL2, MECID_P0_EL2, + * MECID_P1_EL2, MECIDR_EL2, VMECID_A_EL2, + * VMECID_P_EL2. + */ + SR_RANGE_TRAP(SYS_VBAR_EL2, + SYS_RMR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_VDISR_EL2, CGT_HCR_NV), + /* ICH_AP0R_EL2 */ + SR_RANGE_TRAP(SYS_ICH_AP0R0_EL2, + SYS_ICH_AP0R3_EL2, CGT_HCR_NV), + /* ICH_AP1R_EL2 */ + SR_RANGE_TRAP(SYS_ICH_AP1R0_EL2, + SYS_ICH_AP1R3_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICC_SRE_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_ICH_HCR_EL2, + SYS_ICH_EISR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICH_ELRSR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_ICH_VMCR_EL2, CGT_HCR_NV), + /* ICH_LR_EL2 */ + SR_RANGE_TRAP(SYS_ICH_LR0_EL2, + SYS_ICH_LR15_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CONTEXTIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_TPIDR_EL2, CGT_HCR_NV), + SR_TRAP(SYS_SCXTNUM_EL2, CGT_HCR_NV), + /* AMEVCNTVOFF0_EL2, AMEVCNTVOFF1_EL2 */ + SR_RANGE_TRAP(SYS_AMEVCNTVOFF0n_EL2(0), + SYS_AMEVCNTVOFF1n_EL2(15), CGT_HCR_NV), + /* CNT*_EL2 */ + SR_TRAP(SYS_CNTVOFF_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CNTPOFF_EL2, CGT_HCR_NV), + SR_TRAP(SYS_CNTHCTL_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_CNTHP_TVAL_EL2, + SYS_CNTHP_CVAL_EL2, CGT_HCR_NV), + SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2, + SYS_CNTHV_CVAL_EL2, CGT_HCR_NV), + /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/ SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0), sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV), SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0), - sys_reg(3, 5, 14, 15, 7), CGT_HCR_NV), + sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV), + SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT), + SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT), SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV), SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), @@ -752,6 +988,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), @@ -762,77 +999,77 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM), - SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM), - SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM), + SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA), @@ -935,11 +1172,97 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB), + SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC), + SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM), + SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM), + SR_TRAP(SYS_POR_EL0, CGT_CPACR_E0POE), + /* op0=2, op1=1, and CRn<0b1000 */ + SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0), + sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA), SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT), + /* + * IMPDEF choice: + * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as + * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for + * ICC_SRE_EL1 access, and always handle it locally. + */ + SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), + SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), }; static DEFINE_XARRAY(sr_forward_xa); @@ -1006,6 +1329,7 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { SR_FGT(SYS_TPIDRRO_EL0, HFGxTR, TPIDRRO_EL0, 1), SR_FGT(SYS_TPIDR_EL1, HFGxTR, TPIDR_EL1, 1), SR_FGT(SYS_TCR_EL1, HFGxTR, TCR_EL1, 1), + SR_FGT(SYS_TCR2_EL1, HFGxTR, TCR_EL1, 1), SR_FGT(SYS_SCXTNUM_EL0, HFGxTR, SCXTNUM_EL0, 1), SR_FGT(SYS_SCXTNUM_EL1, HFGxTR, SCXTNUM_EL1, 1), SR_FGT(SYS_SCTLR_EL1, HFGxTR, SCTLR_EL1, 1), @@ -1781,7 +2105,8 @@ int __init populate_nv_trap_config(void) cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; for (int i = 0; cgids[i] != __RESERVED__; i++) { - if (cgids[i] >= __MULTIPLE_CONTROL_BITS__) { + if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ && + cgids[i] < __COMPLEX_CONDITIONS__) { kvm_err("Recursive MCB %d/%d\n", id, cgids[i]); ret = -EINVAL; } @@ -1883,14 +2208,22 @@ static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) masks = kvm->arch.sysreg_masks; - return masks->mask[sr - __VNCR_START__].res0; + return masks->mask[sr - __SANITISED_REG_START__].res0; } -static bool check_fgt_bit(struct kvm *kvm, bool is_read, +static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read, u64 val, const union trap_config tc) { + struct kvm *kvm = vcpu->kvm; enum vcpu_sysreg sr; + /* + * KVM doesn't know about any FGTs that apply to the host, and hopefully + * that'll remain the case. + */ + if (is_hyp_ctxt(vcpu)) + return false; + if (tc.pol) return (val & BIT(tc.bit)); @@ -1967,7 +2300,15 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) * If we're not nesting, immediately return to the caller, with the * sysreg index, should we have it. */ - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + if (!vcpu_has_nv(vcpu)) + goto local; + + /* + * There are a few traps that take effect InHost, but are constrained + * to EL0. Don't bother with computing the trap behaviour if the vCPU + * isn't in EL0. + */ + if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu)) goto local; switch ((enum fgt_group_id)tc.fgt) { @@ -2013,12 +2354,14 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) goto local; } - if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu->kvm, is_read, - val, tc)) + if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, is_read, val, tc)) goto inject; b = compute_trap_behaviour(vcpu, tc); + if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu)) + goto local; + if (((b & BEHAVE_FORWARD_READ) && is_read) || ((b & BEHAVE_FORWARD_WRITE) && !is_read)) goto inject; @@ -2052,6 +2395,26 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) return true; } +static bool forward_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + bool control_bit_set; + + if (!vcpu_has_nv(vcpu)) + return false; + + control_bit_set = __vcpu_sys_reg(vcpu, HCR_EL2) & control_bit; + if (!is_hyp_ctxt(vcpu) && control_bit_set) { + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return true; + } + return false; +} + +bool forward_smc_trap(struct kvm_vcpu *vcpu) +{ + return forward_traps(vcpu, HCR_TSC); +} + static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) { u64 mode = spsr & PSR_MODE_MASK; @@ -2087,49 +2450,54 @@ static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) { - u64 spsr, elr, mode; - bool direct_eret; + u64 spsr, elr, esr; /* - * Going through the whole put/load motions is a waste of time - * if this is a VHE guest hypervisor returning to its own - * userspace, or the hypervisor performing a local exception - * return. No need to save/restore registers, no need to - * switch S2 MMU. Just do the canonical ERET. + * Forward this trap to the virtual EL2 if the virtual + * HCR_EL2.NV bit is set and this is coming from !EL2. */ + if (forward_traps(vcpu, HCR_NV)) + return; + spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); spsr = kvm_check_illegal_exception_return(vcpu, spsr); - mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); - - direct_eret = (mode == PSR_MODE_EL0t && - vcpu_el2_e2h_is_set(vcpu) && - vcpu_el2_tge_is_set(vcpu)); - direct_eret |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t); - - if (direct_eret) { - *vcpu_pc(vcpu) = vcpu_read_sys_reg(vcpu, ELR_EL2); - *vcpu_cpsr(vcpu) = spsr; - trace_kvm_nested_eret(vcpu, *vcpu_pc(vcpu), spsr); - return; + /* Check for an ERETAx */ + esr = kvm_vcpu_get_esr(vcpu); + if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) { + /* + * Oh no, ERETAx failed to authenticate. + * + * If we have FPACCOMBINE and we don't have a pending + * Illegal Execution State exception (which has priority + * over FPAC), deliver an exception right away. + * + * Otherwise, let the mangled ELR value trickle down the + * ERET handling, and the guest will have a little surprise. + */ + if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) { + esr &= ESR_ELx_ERET_ISS_ERETA; + esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC); + kvm_inject_nested_sync(vcpu, esr); + return; + } } preempt_disable(); kvm_arch_vcpu_put(vcpu); - elr = __vcpu_sys_reg(vcpu, ELR_EL2); + if (!esr_iss_is_eretax(esr)) + elr = __vcpu_sys_reg(vcpu, ELR_EL2); trace_kvm_nested_eret(vcpu, elr, spsr); - /* - * Note that the current exception level is always the virtual EL2, - * since we set HCR_EL2.NV bit only when entering the virtual EL2. - */ *vcpu_pc(vcpu) = elr; *vcpu_cpsr(vcpu) = spsr; kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); + + kvm_pmu_nested_transition(vcpu); } static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, @@ -2212,6 +2580,8 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); + kvm_pmu_nested_transition(vcpu); + return 1; } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 24471ec60bd99d6537124509563814c99b5c432e..caf1a72d1b157dfecbed8e320bd574ce3881de5c 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -14,19 +14,6 @@ #include #include -void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu) -{ - struct task_struct *p = vcpu->arch.parent_task; - struct user_fpsimd_state *fpsimd; - - if (!is_protected_kvm_enabled() || !p) - return; - - fpsimd = &p->thread.uw.fpsimd_state; - kvm_unshare_hyp(fpsimd, fpsimd + 1); - put_task_struct(p); -} - /* * Called on entry to KVM_RUN unless this vcpu previously ran at least * once and the most recent prior KVM_RUN for this vcpu was called from @@ -38,28 +25,18 @@ void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu) */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) { - int ret; - struct user_fpsimd_state *fpsimd = ¤t->thread.uw.fpsimd_state; + int ret; - kvm_vcpu_unshare_task_fp(vcpu); + /* pKVM has its own tracking of the host fpsimd state. */ + if (is_protected_kvm_enabled()) + return 0; /* Make sure the host task fpsimd state is visible to hyp: */ ret = kvm_share_hyp(fpsimd, fpsimd + 1); if (ret) return ret; - /* - * We need to keep current's task_struct pinned until its data has been - * unshared with the hypervisor to make sure it is not re-used by the - * kernel and donated to someone else while already shared -- see - * kvm_vcpu_unshare_task_fp() for the matching put_task_struct(). - */ - if (is_protected_kvm_enabled()) { - get_task_struct(current); - vcpu->arch.parent_task = current; - } - return 0; } @@ -87,7 +64,13 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) */ fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); + + /* + * If normal guests gain SME support, maintain this behavior for pKVM + * guests, which don't support SME. + */ + WARN_ON(is_protected_kvm_enabled() && system_supports_sme() && + read_sysreg_s(SYS_SVCR)); } /* @@ -116,8 +99,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) WARN_ON_ONCE(!irqs_disabled()); - if (*host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED) { - + if (guest_owns_fp_regs()) { /* * Currently we do not support SME guests so SVCR is * always 0 and we just need a variable to point to. @@ -153,7 +135,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); - if (*host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED) { + if (guest_owns_fp_regs()) { /* * Flush (save and invalidate) the fpsimd/sve state so that if * the host tries to use fpsimd/sve, it's not using stale data diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 882d3f2eec9495dce132e56a15550d1ad1975f76..7c0b853a9fc8773eaa83fd8d21230f771386dd0d 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -55,6 +55,13 @@ static int handle_hvc(struct kvm_vcpu *vcpu) static int handle_smc(struct kvm_vcpu *vcpu) { + /* + * Forward this trapped smc instruction to the virtual EL2 if + * the guest has asked for it. + */ + if (forward_smc_trap(vcpu)) + return 1; + /* * "If an SMC instruction executed at Non-secure EL1 is * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a @@ -87,11 +94,19 @@ static int handle_smc(struct kvm_vcpu *vcpu) } /* - * Guest access to FP/ASIMD registers are routed to this handler only - * when the system doesn't support FP/ASIMD. + * This handles the cases where the system does not support FP/ASIMD or when + * we are running nested virtualization and the guest hypervisor is trapping + * FP/ASIMD accesses by its guest guest. + * + * All other handling of guest vs. host FP/ASIMD register state is handled in + * fixup_guest_exit(). */ -static int handle_no_fpsimd(struct kvm_vcpu *vcpu) +static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu) { + if (guest_hyp_fpsimd_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + /* This is the case when the system doesn't support FP/ASIMD. */ kvm_inject_undefined(vcpu); return 1; } @@ -214,24 +229,48 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) */ static int handle_sve(struct kvm_vcpu *vcpu) { + if (guest_hyp_sve_traps_enabled(vcpu)) + return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + kvm_inject_undefined(vcpu); return 1; } /* - * Guest usage of a ptrauth instruction (which the guest EL1 did not turn into - * a NOP). If we get here, it is that we didn't fixup ptrauth on exit, and all - * that we can do is give the guest an UNDEF. + * Two possibilities to handle a trapping ptrauth instruction: + * + * - Guest usage of a ptrauth instruction (which the guest EL1 did not + * turn into a NOP). If we get here, it is because we didn't enable + * ptrauth for the guest. This results in an UNDEF, as it isn't + * supposed to use ptrauth without being told it could. + * + * - Running an L2 NV guest while L1 has left HCR_EL2.API==0, and for + * which we reinject the exception into L1. + * + * Anything else is an emulation bug (hence the WARN_ON + UNDEF). */ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu) { + if (!vcpu_has_ptrauth(vcpu)) { + kvm_inject_undefined(vcpu); + return 1; + } + + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + return 1; + } + + /* Really shouldn't be here! */ + WARN_ON_ONCE(1); kvm_inject_undefined(vcpu); return 1; } static int kvm_handle_eret(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_get_esr(vcpu) & ESR_ELx_ERET_ISS_ERET) + if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) && + !vcpu_has_ptrauth(vcpu)) return kvm_handle_ptrauth(vcpu); /* @@ -288,7 +327,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, - [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd, [ESR_ELx_EC_PAC] = kvm_handle_ptrauth, }; diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 61e6f3ba7b7d17b5d62e3eedf834cdfc1494417d..e950875e31cee4df58d041519b7584356463c91b 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state) sve_load 0, x1, x2, 3 ret SYM_FUNC_END(__sve_restore_state) + +SYM_FUNC_START(__sve_save_state) + mov x2, #1 + sve_save 0, x1, x2, 3 + ret +SYM_FUNC_END(__sve_save_state) diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9ddcfe2c3e574fc8d85b01d91f244f2091350247..487c06099d6fc1e01e1f5218916038cd83010056 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -27,7 +27,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) + if (!__kvm_at(OP_AT_S1E1R, far)) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ @@ -60,7 +60,7 @@ static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) */ if (!(esr & ESR_ELx_S1PTW) && (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) { + esr_fsc_is_permission_fault(esr))) { if (!__translate_far_to_hpfar(far, &hpfar)) return false; } else { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 76cdda3e97bb2cd3600cc92c448c82890bd79101..88bd2c29116a8d77110c37394a97fb474caff03b 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -38,12 +39,6 @@ struct kvm_exception_table_entry { extern struct kvm_exception_table_entry __start___kvm_ex_table; extern struct kvm_exception_table_entry __stop___kvm_ex_table; -/* Check whether the FP regs are owned by the guest */ -static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu) -{ - return *host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED; -} - /* Save the 32-bit only FPSIMD system register state */ static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu) { @@ -304,10 +299,8 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) __deactivate_traps_mpam(); } -static inline void ___activate_traps(struct kvm_vcpu *vcpu) +static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) { - u64 hcr = vcpu->arch.hcr_el2; - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM)) hcr |= HCR_TVM; @@ -338,17 +331,31 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) { + /* + * The vCPU's saved SVE state layout always matches the max VL of the + * vCPU. Start off with the max VL so we can load the SVE state. + */ sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); __sve_restore_state(vcpu_sve_pffr(vcpu), - &vcpu->arch.ctxt.fp_regs.fpsr); - write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); + &vcpu->arch.ctxt.fp_regs.fpsr, + true); + + /* + * The effective VL for a VM could differ from the max VL when running a + * nested guest, as the guest hypervisor could select a smaller VL. Slap + * that into hardware before wrapping up. + */ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) + sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2); + + write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR); } static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) { u64 zcr_el1, zcr_el2; - if (!guest_owns_fp_regs(vcpu)) + if (!guest_owns_fp_regs()) return; if (vcpu_has_sve(vcpu)) { @@ -356,7 +363,7 @@ static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) write_sysreg_el2(zcr_el2, SYS_ZCR); - zcr_el1 = __vcpu_sys_reg(vcpu, ZCR_EL1); + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); write_sysreg_el1(zcr_el1, SYS_ZCR); } } @@ -365,7 +372,7 @@ static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) { u64 zcr_el1, zcr_el2; - if (!guest_owns_fp_regs(vcpu)) + if (!guest_owns_fp_regs()) return; /* @@ -378,7 +385,7 @@ static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) */ if (vcpu_has_sve(vcpu)) { zcr_el1 = read_sysreg_el1(SYS_ZCR); - __vcpu_sys_reg(vcpu, ZCR_EL1) = zcr_el1; + __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; /* * The guest's state is always saved using the guest's max VL. @@ -399,6 +406,40 @@ static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) } } +static inline void __hyp_sve_save_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR); + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); +} + +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (system_supports_sve()) { + __hyp_sve_save_host(); + + /* Re-enable SVE traps if not supported for the guest vcpu. */ + if (!vcpu_has_sve(vcpu)) + cpacr_clear_set(CPACR_ELx_ZEN, 0); + + } else { + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); + } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); +} + + /* * We trap the first access to the FP/SIMD to save the host context and * restore the guest context lazily. @@ -409,7 +450,6 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; - u64 reg; if (!system_supports_fpsimd()) return false; @@ -420,10 +460,19 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Only handle traps the vCPU can support here: */ switch (esr_ec) { case ESR_ELx_EC_FP_ASIMD: + /* Forward traps to the guest hypervisor as required */ + if (guest_hyp_fpsimd_traps_enabled(vcpu)) + return false; break; + case ESR_ELx_EC_SYS64: + if (WARN_ON_ONCE(!is_hyp_ctxt(vcpu))) + return false; + fallthrough; case ESR_ELx_EC_SVE: if (!sve_guest) return false; + if (guest_hyp_sve_traps_enabled(vcpu)) + return false; break; default: return false; @@ -432,21 +481,16 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (has_vhe() || has_hvhe()) { - reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; - if (sve_guest) - reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; - - sysreg_clear_set(cpacr_el1, 0, reg); - } else { - reg = CPTR_EL2_TFP; - if (sve_guest) - reg |= CPTR_EL2_TZ; - - sysreg_clear_set(cptr_el2, reg, 0); - } + if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) + cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + else + cpacr_clear_set(0, CPACR_ELx_FPEN); isb(); + /* Write out the host state if it's in the registers */ + if (is_protected_kvm_enabled() && host_owns_fp_regs()) + kvm_hyp_save_fpsimd_host(vcpu); + /* Restore the guest state */ if (sve_guest) __hyp_sve_restore_guest(vcpu); @@ -517,65 +561,25 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } -static inline bool esr_is_ptrauth_trap(u64 esr) +/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ +static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) { - switch (esr_sys64_to_sysreg(esr)) { - case SYS_APIAKEYLO_EL1: - case SYS_APIAKEYHI_EL1: - case SYS_APIBKEYLO_EL1: - case SYS_APIBKEYHI_EL1: - case SYS_APDAKEYLO_EL1: - case SYS_APDAKEYHI_EL1: - case SYS_APDBKEYLO_EL1: - case SYS_APDBKEYHI_EL1: - case SYS_APGAKEYLO_EL1: - case SYS_APGAKEYHI_EL1: - return true; - } + u64 offset = 0; - return false; + if (ctxt->offset.vm_offset) + offset += *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); + + return offset; } -#define __ptrauth_save_key(ctxt, key) \ - do { \ - u64 __val; \ - __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \ - ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val; \ - __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \ - ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \ -} while(0) - -#ifdef MODULE -extern struct kvm_cpu_context __percpu *kvm_hyp_ctxt; -#else -DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); -#endif - -static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline u64 compute_counter_value(struct arch_timer_context *ctxt) { - struct kvm_cpu_context *ctxt; - u64 val; - - if (!vcpu_has_ptrauth(vcpu)) - return false; - - ctxt = this_cpu_ptr_wrapper(kvm_hyp_ctxt); - __ptrauth_save_key(ctxt, APIA); - __ptrauth_save_key(ctxt, APIB); - __ptrauth_save_key(ctxt, APDA); - __ptrauth_save_key(ctxt, APDB); - __ptrauth_save_key(ctxt, APGA); - - vcpu_ptrauth_enable(vcpu); - - val = read_sysreg(hcr_el2); - val |= (HCR_API | HCR_APK); - write_sysreg(val, hcr_el2); - - return true; + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); } -static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) +static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) { struct arch_timer_context *ctxt; u32 sysreg; @@ -585,18 +589,19 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) * We only get here for 64bit guests, 32bit guests will hit * the long and winding road all the way to the standard * handling. Yes, it sucks to be irrelevant. + * + * Also, we only deal with non-hypervisor context here (either + * an EL1 guest, or a non-HYP context of an EL2 guest). */ + if (is_hyp_ctxt(vcpu)) + return false; + sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); switch (sysreg) { case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: if (vcpu_has_nv(vcpu)) { - if (is_hyp_ctxt(vcpu)) { - ctxt = vcpu_hptimer(vcpu); - break; - } - /* Check for guest hypervisor trapping */ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) @@ -608,16 +613,23 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) ctxt = vcpu_ptimer(vcpu); break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (val & CNTHCTL_EL1TVCT) + return false; + } + + ctxt = vcpu_vtimer(vcpu); + break; default: return false; } - val = arch_timer_read_cntpct_el0(); - - if (ctxt->offset.vm_offset) - val -= *kern_hyp_va(ctxt->offset.vm_offset); - if (ctxt->offset.vcpu_offset) - val -= *kern_hyp_va(ctxt->offset.vcpu_offset); + val = compute_counter_value(ctxt); vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); __kvm_skip_instr(vcpu); @@ -662,10 +674,7 @@ static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) __vgic_v3_perform_cpuif_access(vcpu) == 1) return true; - if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) - return kvm_hyp_handle_ptrauth(vcpu, exit_code); - - if (kvm_hyp_handle_cntpct(vcpu)) + if (kvm_handle_cntxct(vcpu)) return true; return false; @@ -699,7 +708,7 @@ static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { bool valid; - valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT && + valid = kvm_vcpu_trap_is_translation_fault(vcpu) && kvm_vcpu_dabt_isvalid(vcpu) && !kvm_vcpu_abt_issea(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 4be6a7fa007082ac008c83422e9bb69b0f5da324..41e23b769e8c38f2d33255272e23af4a9290dd40 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -115,9 +115,10 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0); } -static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) +static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, + u64 mpidr) { - write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2); + write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 20c3f6e13b99f4750fdaf1c9b157c8f6ef62ebba..24a9a8330d190396e90abc7202e6b7e86ea1de63 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -53,8 +53,12 @@ pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); } +static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return vcpu_is_protected(&hyp_vcpu->vcpu); +} + void pkvm_hyp_vm_table_init(void *tbl); -void pkvm_host_fpsimd_state_init(void); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva); diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 45a84f0ade04bd6840bd11e1ee3bf70eabe4ff0b..1e6d995968a1fbd1c93cc80bfe3693da917a4adb 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,6 +15,4 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); - #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 0f9d8422cf808b582152af5b392e7407c8bbe012..cae00e7237efbc255539dbc3180b63b14efdc106 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -129,11 +129,7 @@ alternative_if ARM64_HAS_CNP alternative_else_nop_endif msr ttbr0_el2, x2 - /* - * Set the PS bits in TCR_EL2. - */ ldr x0, [x0, #NVHE_INIT_TCR_EL2] - tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 msr tcr_el2, x0 isb diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 72d9edb7dd9b3a3d8f8c679963840220f83301df..9cd57bee835c92f1834f484e34878cee600fe2c1 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -24,19 +24,82 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) +{ + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + /* + * On saving/restoring guest sve state, always use the maximum VL for + * the guest. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) guest VL. + */ + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); +} + +static void __hyp_sve_restore_host(void) +{ + struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); + + /* + * On saving/restoring host sve state, always use the maximum VL for + * the host. The layout of the data when saving the sve state depends + * on the VL, so use a consistent (i.e., the maximum) host VL. + * + * Setting ZCR_EL2 to ZCR_ELx_LEN_MASK sets the effective length + * supported by the system (or limited at EL3). + */ + write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), + &sve_state->fpsr, + true); + write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR); +} + +static void fpsimd_sve_flush(void) +{ + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + +static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + return; + + cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + isb(); + + if (vcpu_has_sve(vcpu)) + __hyp_sve_save_guest(vcpu); + else + __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + + if (system_supports_sve()) + __hyp_sve_restore_host(); + else + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); + + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + fpsimd_sve_flush(); + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; + /* Limit guest vector length to the maximum supported by the host. */ + hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; - hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & + (HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; @@ -54,6 +117,8 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; unsigned int i; + fpsimd_sve_sync(&hyp_vcpu->vcpu); + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; @@ -78,6 +143,17 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) struct pkvm_hyp_vcpu *hyp_vcpu; struct kvm *host_kvm; + /* + * KVM (and pKVM) doesn't support SME guests for now, and + * ensures that SME features aren't enabled in pstate when + * loading a vcpu. Therefore, if SME features enabled the host + * is misbehaving. + */ + if (unlikely(system_supports_sme() && read_sysreg_s(SYS_SVCR))) { + ret = -EINVAL; + goto out; + } + host_kvm = kern_hyp_va(host_vcpu->kvm); hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, host_vcpu->vcpu_idx); @@ -176,33 +252,23 @@ static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } -static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr(); -} - -static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt) -{ - __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1)); -} - static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) { __vgic_v3_init_lrs(); } -static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if)); } -static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if)); } static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) @@ -273,13 +339,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } -static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); - - __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); -} - static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -332,11 +391,8 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_tlb_flush_vmid_range), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__vgic_v3_read_vmcr), - HANDLE_FUNC(__vgic_v3_write_vmcr), - HANDLE_FUNC(__vgic_v3_save_aprs), - HANDLE_FUNC(__vgic_v3_restore_aprs), - HANDLE_FUNC(__pkvm_vcpu_init_traps), + HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), + HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 8d0a5834e8830059d43d464c4187ff63990e9770..caba3e4bd09e8405a4138a8d61c59375187550e0 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -91,7 +91,7 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } -static void host_s2_free_unlinked_table(void *addr, u32 level) +static void host_s2_free_unlinked_table(void *addr, s8 level) { kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); } @@ -443,7 +443,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) { struct kvm_mem_range cur; kvm_pte_t pte; - u32 level; + s8 level; int ret; hyp_assert_lock_held(&host_mmu.lock); @@ -462,7 +462,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) cur.start = ALIGN_DOWN(addr, granule); cur.end = cur.start + granule; level++; - } while ((level < KVM_PGTABLE_MAX_LEVELS) && + } while ((level <= KVM_PGTABLE_LAST_LEVEL) && !(kvm_level_supports_block_mapping(level) && range_included(&cur, range))); @@ -533,7 +533,13 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) int ret = 0; esr = read_sysreg_el2(SYS_ESR); - BUG_ON(!__get_fault_info(esr, &fault)); + if (!__get_fault_info(esr, &fault)) { + /* + * We've presumably raced with a page-table change which caused + * AT to fail, try again. + */ + return; + } addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; ret = host_stage2_idmap(addr); diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 65a7a186d7b217e599688f465b9d49b64ee2b9f0..b01a3d1078a8803f061496044066d749f9881f94 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -260,7 +260,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1)); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL); dsb(ish); isb(); } @@ -275,7 +275,7 @@ static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx, { struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg); - if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 1) + if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL) return -EINVAL; slot->addr = ctx->addr; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 93ac7210cfcbbc9b3e1fe2fa6c02100c20daa942..93dc3afa6a5141e5887a5709dda971f26252e3f3 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -6,6 +6,9 @@ #include #include + +#include + #include #include #include @@ -174,11 +177,45 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); } +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; + + if (has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); +} + /* - * Initialize trap register values for protected VMs. + * Initialize trap register values in protected mode. */ -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) { + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!vcpu_is_protected(vcpu))) + return; + pvm_init_trap_regs(vcpu); pvm_init_traps_aa64pfr0(vcpu); pvm_init_traps_aa64pfr1(vcpu); @@ -222,17 +259,6 @@ void pkvm_hyp_vm_table_init(void *tbl) vm_table = tbl; } -void pkvm_host_fpsimd_state_init(void) -{ - unsigned long i; - - for (i = 0; i < hyp_nr_cpus; i++) { - struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i); - - host_data->fpsimd_state = &host_data->host_ctxt.fp_regs; - } -} - /* * Return the hyp vm structure corresponding to the handle. */ @@ -273,6 +299,52 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_spin_unlock(&vm_table_lock); } +static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) +{ + struct kvm *kvm = &hyp_vm->kvm; + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* No restrictions for non-protected VMs. */ + if (!kvm_vm_is_protected(kvm)) { + bitmap_copy(kvm->arch.vcpu_features, + host_kvm->arch.vcpu_features, + KVM_VCPU_MAX_FEATURES); + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, + allowed_features, KVM_VCPU_MAX_FEATURES); +} + +static void pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) { + kvm_vcpu_enable_ptrauth(vcpu); + } else { + vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); + } +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -294,6 +366,18 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + pkvm_init_features_from_host(hyp_vm, host_kvm); +} + +static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { + vcpu_clear_flag(vcpu, GUEST_HAS_SVE); + vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + } } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, @@ -319,6 +403,11 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + + pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); + pkvm_vcpu_init_ptrauth(hyp_vcpu); + pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); @@ -416,6 +505,7 @@ static void *map_donated_memory(unsigned long host_va, size_t size) static void __unmap_donated_memory(void *va, size_t size) { + kvm_flush_dcache_to_poc(va, size); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), PAGE_ALIGN(size) >> PAGE_SHIFT)); } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 2fb337b6e7e4882f8319be8f7c69b67a6a69609c..f4350ba07b0b0c37440af0783d313108e1cc0a92 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -67,6 +67,28 @@ static int divide_memory_pool(void *virt, unsigned long size) return 0; } +static int pkvm_create_host_sve_mappings(void) +{ + void *start, *end; + int ret, i; + + if (!system_supports_sve()) + return 0; + + for (i = 0; i < hyp_nr_cpus; i++) { + struct kvm_host_data *host_data = per_cpu_ptr(&kvm_host_data, i); + struct cpu_sve_state *sve_state = host_data->sve_state; + + start = kern_hyp_va(sve_state); + end = start + PAGE_ALIGN(pkvm_host_sve_state_size()); + ret = pkvm_create_mappings(start, end, PAGE_HYP); + if (ret) + return ret; + } + + return 0; +} + static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits) @@ -125,6 +147,8 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } + pkvm_create_host_sve_mappings(); + /* * Map the host sections RO in the hypervisor, but transfer the * ownership from the host to the hypervisor itself to make sure they @@ -181,7 +205,7 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!kvm_pte_valid(ctx->old)) return 0; - if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1)) + if (ctx->level != KVM_PGTABLE_LAST_LEVEL) return -EINVAL; phys = kvm_pte_to_phys(ctx->old); @@ -300,7 +324,6 @@ void __noreturn __pkvm_init_finalise(void) goto out; pkvm_hyp_vm_table_init(vm_table_base); - pkvm_host_fpsimd_state_init(); out: /* * We tail-called to here from handle___pkvm_init() and will not return, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f1970816fa436d8f2b1d67517840de4bdd73b9c6..039be0998a54502e72fe5a610d2720febf580b23 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -40,13 +40,13 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - if (!guest_owns_fp_regs(vcpu)) + if (!guest_owns_fp_regs()) __activate_traps_fpsimd32(vcpu); if (has_hvhe()) { val |= CPACR_ELx_TTA; - if (guest_owns_fp_regs(vcpu)) { + if (guest_owns_fp_regs()) { val |= CPACR_ELx_FPEN; if (vcpu_has_sve(vcpu)) val |= CPACR_ELx_ZEN; @@ -62,10 +62,10 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) */ val |= CPTR_EL2_TSM; - if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs(vcpu)) + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) val |= CPTR_EL2_TZ; - if (!guest_owns_fp_regs(vcpu)) + if (!guest_owns_fp_regs()) val |= CPTR_EL2_TFP; write_sysreg(val, cptr_el2); @@ -97,7 +97,7 @@ static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) static void __activate_traps(struct kvm_vcpu *vcpu) { - ___activate_traps(vcpu); + ___activate_traps(vcpu, vcpu->arch.hcr_el2); __activate_traps_common(vcpu); __activate_cptr_traps(vcpu); @@ -228,7 +228,6 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, }; static const exit_handler_fn pvm_exit_handlers[] = { @@ -239,12 +238,11 @@ static const exit_handler_fn pvm_exit_handlers[] = { [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, }; static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) { - if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)))) + if (unlikely(vcpu_is_protected(vcpu))) return pvm_exit_handlers; return hyp_exit_handlers; @@ -253,7 +251,6 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); - struct kvm *kvm = kern_hyp_va(vcpu->kvm); synchronize_vcpu_pstate(vcpu, exit_code); @@ -266,7 +263,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) * it. The check below is based on the one in * kvm_arch_vcpu_ioctl_run(). */ - if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) { + if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { /* * As we have caught the guest red-handed, decide that it isn't * fit for purpose anymore by making the vcpu invalid. The VMM @@ -375,7 +372,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg_restore_state_nvhe(host_ctxt); - if (*host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED) + if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index 29305022bc048f09c4686166153d1784ee8e5c94..dba101565de36718ff63a03503e3941b4430c2be 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -28,7 +28,7 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) { - __sysreg_restore_el1_state(ctxt); + __sysreg_restore_el1_state(ctxt, ctxt_sys_reg(ctxt, MPIDR_EL1)); __sysreg_restore_common_state(ctxt); __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index 3aaab20ae5b4797182de112d1e641112c3f799c7..ff176f4ce7debf557b5d2d0d4e05af8853003c69 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -22,15 +22,16 @@ void __kvm_timer_set_cntvoff(u64 cntvoff) */ void __timer_disable_traps(struct kvm_vcpu *vcpu) { - u64 val, shift = 0; + u64 set, clr, shift = 0; if (has_hvhe()) shift = 10; /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; - write_sysreg(val, cnthctl_el2); + set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; + clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); } /* @@ -58,5 +59,12 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu) set <<= 10; } + /* + * Trap the virtual counter/timer if we have a broken cntvoff + * implementation. + */ + if (has_broken_cntvoff()) + set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + sysreg_clear_set(cnthctl_el2, clr, set); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 574dc96c6845689f8e7dc4f0794c6195d9e36924..446ac1dac49e0b9c44206070be7d9e7b27d48781 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -11,13 +11,23 @@ #include struct tlb_inv_context { - u64 tcr; + struct kvm_s2_mmu *mmu; + u64 tcr; + u64 sctlr; }; -static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt, - bool nsh) +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt, + bool nsh) { + struct kvm_s2_mmu *host_s2_mmu = &host_mmu.arch.mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + cxt->mmu = NULL; + /* * We have two requirements: * @@ -40,20 +50,55 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, else dsb(ish); + /* + * If we're already in the desired context, then there's nothing to do. + */ + if (vcpu) { + /* + * We're in guest context. However, for this to work, this needs + * to be called from within __kvm_vcpu_run(), which ensures that + * __hyp_running_vcpu is set to the current guest vcpu. + */ + if (mmu == vcpu->arch.hw_mmu || WARN_ON(mmu != host_s2_mmu)) + return; + + cxt->mmu = vcpu->arch.hw_mmu; + } else { + /* We're in host context. */ + if (mmu == host_s2_mmu) + return; + + cxt->mmu = host_s2_mmu; + } + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* * For CPUs that are affected by ARM 1319367, we need to - * avoid a host Stage-1 walk while we have the guest's - * VMID set in the VTTBR in order to invalidate TLBs. - * We're guaranteed that the S1 MMU is enabled, so we can - * simply set the EPD bits to avoid any further TLB fill. + * avoid a Stage-1 walk with the old VMID while we have + * the new VMID set in the VTTBR in order to invalidate TLBs. + * We're guaranteed that the host S1 MMU is enabled, so + * we can simply set the EPD bits to avoid any further + * TLB fill. For guests, we ensure that the S1 MMU is + * temporarily enabled in the next context. */ val = cxt->tcr = read_sysreg_el1(SYS_TCR); val |= TCR_EPD1_MASK | TCR_EPD0_MASK; write_sysreg_el1(val, SYS_TCR); isb(); + + if (vcpu) { + val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); + if (!(val & SCTLR_ELx_M)) { + val |= SCTLR_ELx_M; + write_sysreg_el1(val, SYS_SCTLR); + isb(); + } + } else { + /* The host S1 MMU is always enabled. */ + cxt->sctlr = SCTLR_ELx_M; + } } /* @@ -62,18 +107,40 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_stage2(mmu, kern_hyp_va(mmu->arch)); + if (vcpu) + __load_host_stage2(); + else + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static void __tlb_switch_to_host(struct tlb_inv_context *cxt) +static void exit_vmid_context(struct tlb_inv_context *cxt) { - __load_host_stage2(); + struct kvm_s2_mmu *mmu = cxt->mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + + if (!mmu) + return; + + if (vcpu) + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + else + __load_host_stage2(); + + /* Ensure write of the old VMID */ + isb(); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { - /* Ensure write of the host VMID */ - isb(); - /* Restore the host's TCR_EL1 */ + if (!(cxt->sctlr & SCTLR_ELx_M)) { + write_sysreg_el1(cxt->sctlr, SYS_SCTLR); + isb(); + } + write_sysreg_el1(cxt->tcr, SYS_TCR); } } @@ -84,7 +151,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, false); + enter_vmid_context(mmu, &cxt, false); /* * We could do so much better if we had the VA as well. @@ -127,7 +194,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, if (icache_is_vpipt()) icache_inval_all_pou(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, @@ -136,7 +203,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, true); + enter_vmid_context(mmu, &cxt, true); /* * We could do so much better if we had the VA as well. @@ -179,7 +246,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, if (icache_is_vpipt()) icache_inval_all_pou(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, @@ -196,7 +263,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, start = round_down(start, stride); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, false); + enter_vmid_context(mmu, &cxt, false); __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, TLBI_TTL_UNKNOWN); @@ -210,7 +277,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, if (icache_is_vpipt()) icache_inval_all_pou(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) @@ -218,13 +285,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, false); + enter_vmid_context(mmu, &cxt, false); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) @@ -232,19 +299,19 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt, false); + enter_vmid_context(mmu, &cxt, false); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) { - /* Same remark as in __tlb_switch_to_guest() */ + /* Same remark as in enter_vmid_context() */ dsb(ish); __tlbi(alle1is); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 9c20e5ac71ad6832ddc8c0b85eb64fcc14e23011..1a7f463c82d26e2f5d175bb49efa20249138ee7b 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -79,7 +79,10 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) static bool kvm_phys_is_valid(u64 phys) { - return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); + u64 parange_max = kvm_get_parange_max(); + u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max); + + return phys < BIT(shift); } static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) @@ -98,7 +101,7 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, return IS_ALIGNED(ctx->addr, granule); } -static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level) { u64 shift = kvm_granule_shift(level); u64 mask = BIT(PAGE_SHIFT - 3) - 1; @@ -114,7 +117,7 @@ static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) return (addr & mask) >> shift; } -static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) +static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level) { struct kvm_pgtable pgt = { .ia_bits = ia_bits, @@ -124,9 +127,9 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_table(kvm_pte_t pte, u32 level) +static bool kvm_pte_table(kvm_pte_t pte, s8 level) { - if (level == KVM_PGTABLE_MAX_LEVELS - 1) + if (level == KVM_PGTABLE_LAST_LEVEL) return false; if (!kvm_pte_valid(pte)) @@ -154,11 +157,11 @@ static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops return pte; } -static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) +static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) { kvm_pte_t pte = kvm_phys_to_pte(pa); - u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : - KVM_PTE_TYPE_BLOCK; + u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE : + KVM_PTE_TYPE_BLOCK; pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); pte |= FIELD_PREP(KVM_PTE_TYPE, type); @@ -203,11 +206,11 @@ static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level); + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, - kvm_pteref_t pteref, u32 level) + kvm_pteref_t pteref, s8 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); @@ -272,12 +275,13 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level) + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level) { u32 idx; int ret = 0; - if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) + if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL || + level > KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { @@ -340,7 +344,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct leaf_walk_data { kvm_pte_t pte; - u32 level; + s8 level; }; static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -355,7 +359,7 @@ static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, } int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, - kvm_pte_t *ptep, u32 *level) + kvm_pte_t *ptep, s8 *level) { struct leaf_walk_data data; struct kvm_pgtable_walker walker = { @@ -408,7 +412,8 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) } attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); - attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; @@ -467,7 +472,7 @@ static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, if (hyp_map_walker_try_leaf(ctx, data)) return 0; - if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); @@ -563,14 +568,19 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops) { - u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); + s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 - + ARM64_HW_PGTABLE_LEVELS(va_bits); + + if (start_level < KVM_PGTABLE_FIRST_LEVEL || + start_level > KVM_PGTABLE_LAST_LEVEL) + return -EINVAL; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = va_bits; - pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; + pgt->start_level = start_level; pgt->mm_ops = mm_ops; pgt->mmu = NULL; pgt->force_pte_cb = NULL; @@ -624,7 +634,7 @@ struct stage2_map_data { u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) { u64 vtcr = VTCR_EL2_FLAGS; - u8 lvls; + s8 lvls; vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); @@ -635,6 +645,15 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) lvls = stage2_pgtable_levels(phys_shift); if (lvls < 2) lvls = 2; + + /* + * When LPA2 is enabled, the HW supports an extra level of translation + * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 + * to as an addition to SL0 to enable encoding this extra start level. + * However, since we always use concatenated pages for the first level + * lookup, we will never need this extra level and therefore do not need + * to touch SL2. + */ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); #ifdef CONFIG_ARM64_HW_AFDBM @@ -654,6 +673,9 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) vtcr |= VTCR_EL2_HA; #endif /* CONFIG_ARM64_HW_AFDBM */ + if (kvm_lpa2_is_enabled()) + vtcr |= VTCR_EL2_DS; + /* Set the vmid bits */ vtcr |= (get_vmid_bits(mmfr1) == 16) ? VTCR_EL2_VS_16BIT : @@ -725,7 +747,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p if (prot & KVM_PGTABLE_PROT_W) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; - attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; @@ -890,12 +914,12 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) { u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; - return memattr == KVM_S2_MEMATTR(pgt, NORMAL); + return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL); } static bool stage2_pte_executable(kvm_pte_t pte) { - return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); + return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, @@ -923,7 +947,7 @@ static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, { u64 phys = stage2_map_walker_phys_addr(ctx, data); - if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1))) + if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; return kvm_block_mapping_supported(ctx, phys); @@ -955,6 +979,21 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_pte_needs_update(ctx->old, new)) return -EAGAIN; + /* If we're only changing software bits, then store them and go! */ + if (!kvm_pgtable_walk_shared(ctx) && + !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) { + bool old_is_counted = stage2_pte_is_counted(ctx->old); + + if (old_is_counted != stage2_pte_is_counted(new)) { + if (old_is_counted) + mm_ops->put_page(ctx->ptep); + else + mm_ops->get_page(ctx->ptep); + } + WARN_ON_ONCE(!stage2_try_set_pte(ctx, new)); + return 0; + } + if (!stage2_try_break_pte(ctx, data->mmu)) return -EAGAIN; @@ -1002,7 +1041,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (ret != -E2BIG) return ret; - if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; if (!data->memcache) @@ -1172,7 +1211,7 @@ struct stage2_attr_data { kvm_pte_t attr_set; kvm_pte_t attr_clr; kvm_pte_t pte; - u32 level; + s8 level; }; static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -1215,7 +1254,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, kvm_pte_t attr_clr, kvm_pte_t *orig_pte, - u32 *level, enum kvm_pgtable_walk_flags flags) + s8 *level, enum kvm_pgtable_walk_flags flags) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; @@ -1317,7 +1356,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot) { int ret; - u32 level; + s8 level; kvm_pte_t set = 0, clr = 0; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) @@ -1346,7 +1385,7 @@ static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_pgtable *pgt = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old)) + if (!stage2_pte_cacheable(pgt, ctx->old)) return 0; if (mm_ops->dcache_clean_inval_poc) @@ -1370,7 +1409,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) } kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, - u64 phys, u32 level, + u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte) { @@ -1428,7 +1467,7 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, * fully populated tree up to the PTE entries. Note that @level is * interpreted as in "level @level entry". */ -static int stage2_block_get_nr_page_tables(u32 level) +static int stage2_block_get_nr_page_tables(s8 level) { switch (level) { case 1: @@ -1439,7 +1478,7 @@ static int stage2_block_get_nr_page_tables(u32 level) return 0; default: WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || - level >= KVM_PGTABLE_MAX_LEVELS); + level > KVM_PGTABLE_LAST_LEVEL); return -EINVAL; }; } @@ -1452,13 +1491,13 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu; kvm_pte_t pte = ctx->old, new, *childp; enum kvm_pgtable_prot prot; - u32 level = ctx->level; + s8 level = ctx->level; bool force_pte; int nr_pages; u64 phys; /* No huge-pages exist at the last level */ - if (level == KVM_PGTABLE_MAX_LEVELS - 1) + if (level == KVM_PGTABLE_LAST_LEVEL) return 0; /* We only split valid block mappings */ @@ -1535,7 +1574,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, u64 vtcr = mmu->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); - u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); @@ -1558,7 +1597,7 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) { u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); - u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } @@ -1594,7 +1633,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->pgd = NULL; } -void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 6cb638b184b1832ea43fa146bd5816dce73aa8e0..3f9741e51d41bf57bdb90fd99d96b35725723a1f 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -268,8 +268,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * starting to mess with the rest of the GIC, and VMCR_EL2 in * particular. This logic must be called before * __vgic_v3_restore_state(). + * + * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is + * provisioned at all. In order to prevent illegal accesses to the + * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 + * so that the trap bits can take effect. Yes, we *loves* the GIC. */ - if (!cpu_if->vgic_sre) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) { + write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); + isb(); + } else if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); @@ -288,8 +296,9 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. + * Prevent the guest from touching the ICC_SRE_EL1 system + * register. Note that this may not have any effect, as + * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. */ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); @@ -297,10 +306,11 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) /* * If we need to trap system registers, we must write * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, + * injected. Note that this also applies if we don't expect + * any system register access (no vgic at all). */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } @@ -326,11 +336,11 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) * no interrupts were being injected, and we disable it again here. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(0, ICH_HCR_EL2); } -void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -363,7 +373,7 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) } } -void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -455,16 +465,35 @@ u64 __vgic_v3_get_gic_config(void) return val; } -u64 __vgic_v3_read_vmcr(void) +static u64 __vgic_v3_read_vmcr(void) { return read_gicreg(ICH_VMCR_EL2); } -void __vgic_v3_write_vmcr(u32 vmcr) +static void __vgic_v3_write_vmcr(u32 vmcr) { write_gicreg(vmcr, ICH_VMCR_EL2); } +void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + __vgic_v3_save_aprs(cpu_if); + if (cpu_if->vgic_sre) + cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); +} + +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (cpu_if->vgic_sre) + __vgic_v3_write_vmcr(cpu_if->vgic_vmcr); + __vgic_v3_restore_aprs(cpu_if); +} + static int __vgic_v3_bpr_min(void) { /* See Pseudocode for VPriorityGroup */ @@ -983,9 +1012,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) - val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ @@ -1013,6 +1039,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) write_gicreg(vmcr, ICH_VMCR_EL2); } +static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, + u32 sysreg, bool is_read) +{ + u64 ich_hcr; + + if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + return false; + + ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + switch (sysreg) { + case SYS_ICC_IGRPEN0_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_BPR0_EL1: + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_IAR0_EL1: + return ich_hcr & ICH_HCR_TALL0; + + case SYS_ICC_IGRPEN1_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP1Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(3): + case SYS_ICC_BPR1_EL1: + case SYS_ICC_EOIR1_EL1: + case SYS_ICC_HPPIR1_EL1: + case SYS_ICC_IAR1_EL1: + return ich_hcr & ICH_HCR_TALL1; + + case SYS_ICC_DIR_EL1: + if (ich_hcr & ICH_HCR_TDIR) + return true; + + fallthrough; + + case SYS_ICC_RPR_EL1: + case SYS_ICC_CTLR_EL1: + case SYS_ICC_PMR_EL1: + return ich_hcr & ICH_HCR_TC; + + default: + return false; + } +} + int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; @@ -1022,6 +1117,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) bool is_read; u32 sysreg; + if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return 0; + esr = kvm_vcpu_get_esr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { if (!kvm_condition_valid(vcpu)) { @@ -1036,6 +1134,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) + return 0; + switch (sysreg) { case SYS_ICC_IAR0_EL1: case SYS_ICC_IAR1_EL1: diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index a2fda432820f1182cad76ab240f0d15042f31b72..a85e257fdd116a918e19a7b1676ce28288fff53a 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -38,11 +38,114 @@ DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); #endif DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +/* + * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1 + * semantics, irrespective of the configuration), but that cannot be + * applied to the actual HW as things would otherwise break badly. + * + * - TGE: we want the guest to use EL1, which is incompatible with + * this bit being set + * + * - API/APK: they are already accounted for by vcpu_load(), and can + * only take effect across a load/put cycle (such as ERET) + */ +#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK) + +static u64 __compute_hcr(struct kvm_vcpu *vcpu) +{ + u64 hcr = vcpu->arch.hcr_el2; + + if (!vcpu_has_nv(vcpu)) + return hcr; + + if (is_hyp_ctxt(vcpu)) { + hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; + + if (!vcpu_el2_e2h_is_set(vcpu)) + hcr |= HCR_NV1; + + write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); + } + + return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE); +} + +static void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + u64 cptr; + + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPACR_ELx_TTA | CPTR_EL2_TAM; + + if (guest_owns_fp_regs()) { + val |= CPACR_ELx_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_ELx_ZEN; + } else { + __activate_traps_fpsimd32(vcpu); + } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. + */ + if (!(SYS_FIELD_GET(CPACR_ELx, FPEN, cptr) & BIT(0))) + val &= ~CPACR_ELx_FPEN; + if (!(SYS_FIELD_GET(CPACR_ELx, ZEN, cptr) & BIT(0))) + val &= ~CPACR_ELx_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_ELx_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); +} + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; - ___activate_traps(vcpu); + ___activate_traps(vcpu, __compute_hcr(vcpu)); if (has_cntpoff()) { struct timer_map map; @@ -64,31 +167,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) } } - val = read_sysreg(cpacr_el1); - val |= CPACR_ELx_TTA; - val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | - CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); - - /* - * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to - * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, - * except for some missing controls, such as TAM. - * In this case, CPTR_EL2.TAM has the same position with or without - * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM - * shift value for trapping the AMU accesses. - */ - - val |= CPTR_EL2_TAM; - - if (guest_owns_fp_regs(vcpu)) { - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; - } else { - val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); - __activate_traps_fpsimd32(vcpu); - } - - write_sysreg(val, cpacr_el1); + __activate_cptr_traps(vcpu); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); } @@ -167,6 +246,8 @@ static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu) void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu) { + host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu; + __vcpu_load_switch_sysregs(vcpu); __vcpu_load_activate_traps(vcpu); __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); @@ -176,18 +257,281 @@ void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) { __vcpu_put_deactivate_traps(vcpu); __vcpu_put_switch_sysregs(vcpu); + + host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; +} + +static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + unsigned long ctl; + u64 cval, cnt; + bool stat; + + switch (reg) { + case CNTP_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + cnt = compute_counter_value(vcpu_ptimer(vcpu)); + break; + case CNTV_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); + cnt = compute_counter_value(vcpu_vtimer(vcpu)); + break; + default: + BUG(); + } + + stat = cval <= cnt; + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); + + return ctl; +} + +static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr, val; + + /* + * Having FEAT_ECV allows for a better quality of timer emulation. + * However, this comes at a huge cost in terms of traps. Try and + * satisfy the reads from guest's hypervisor context without + * returning to the kernel if we can. + */ + if (!is_hyp_ctxt(vcpu)) + return false; + + esr = kvm_vcpu_get_esr(vcpu); + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) + return false; + + switch (esr_sys64_to_sysreg(esr)) { + case SYS_CNTP_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTP_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + break; + case SYS_CNTP_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el0(SYS_CNTP_CVAL); + + if (!has_cntpoff()) + val -= timer_get_offset(vcpu_hptimer(vcpu)); + } else { + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + } + break; + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + val = compute_counter_value(vcpu_hptimer(vcpu)); + break; + case SYS_CNTV_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTV_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CVAL); + else + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + val = compute_counter_value(vcpu_hvtimer(vcpu)); + break; + default: + return false; + } + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 spsr, elr, mode; + + /* + * Going through the whole put/load motions is a waste of time + * if this is a VHE guest hypervisor returning to its own + * userspace, or the hypervisor performing a local exception + * return. No need to save/restore registers, no need to + * switch S2 MMU. Just do the canonical ERET. + * + * Unless the trap has to be forwarded further down the line, + * of course... + */ + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) || + (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET)) + return false; + + spsr = read_sysreg_el1(SYS_SPSR); + mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL0t: + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + return false; + break; + case PSR_MODE_EL2t: + mode = PSR_MODE_EL1t; + break; + case PSR_MODE_EL2h: + mode = PSR_MODE_EL1h; + break; + default: + return false; + } + + /* If ERETAx fails, take the slow path */ + if (esr_iss_is_eretax(esr)) { + if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr))) + return false; + } else { + elr = read_sysreg_el1(SYS_ELR); + } + + spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; + + write_sysreg_el2(spsr, SYS_SPSR); + write_sysreg_el2(elr, SYS_ELR); + + return true; +} + +static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + int ret = -EINVAL; + u32 instr; + u64 val; + + /* + * Ideally, we would never trap on EL2 S1 TLB invalidations using + * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}. + * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2, + * meaning that we can't track changes to the virtual TGE bit. So we + * have to leave HCR_EL2.TTLB set on the host. Oopsie... + * + * Try and handle these invalidation as quickly as possible, without + * fully exiting. Note that we don't need to consider any forwarding + * here, as having E2H+TGE set is the very definition of being + * InHost. + * + * For the lesser hypervisors out there that have failed to get on + * with the VHE program, we can also handle the nVHE style of EL2 + * invalidation. + */ + if (!(is_hyp_ctxt(vcpu))) + return false; + + instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) && + vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) || + kvm_supported_tlbi_s1e2_op (vcpu, instr)) + ret = __kvm_tlbi_s1e2(NULL, val, instr); + + if (ret) + return false; + + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr = kvm_vcpu_get_esr(vcpu); + int rt; + + if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1) + return false; + + rt = kvm_vcpu_sys_get_rt(vcpu); + + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) { + vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2)); + } else { + vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2); + __activate_cptr_traps(vcpu); + } + + __kvm_skip_instr(vcpu); + + return true; +} + +static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + + if (!vcpu_has_nv(vcpu)) + return false; + + if (sysreg != SYS_ZCR_EL2) + return false; + + if (guest_owns_fp_regs()) + return false; + + /* + * ZCR_EL2 traps are handled in the slow path, with the expectation + * that the guest's FP context has already been loaded onto the CPU. + * + * Load the guest's FP context and unconditionally forward to the + * slow path for handling (i.e. return false). + */ + kvm_hyp_handle_fpsimd(vcpu, exit_code); + return false; +} + +static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_timer(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) + return true; + + if (kvm_hyp_handle_zcr_el2(vcpu, exit_code)) + return true; + + return kvm_hyp_handle_sysreg(vcpu, exit_code); } static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, - [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe, [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, - [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, + [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, }; static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) @@ -231,7 +575,6 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) u64 exit_code; host_ctxt = host_data_ptr(host_ctxt); - host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; sysreg_save_host_state_vhe(host_ctxt); @@ -266,7 +609,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_restore_host_state_vhe(host_ctxt); - if (*host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED) + if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 33bcaa03da89459a74e05d6d74107af5b3a9fac4..cf1a8ea5d8f373da720746fb6a7a71cf72d48952 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -15,6 +15,107 @@ #include #include +static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) +{ + /* These registers are common with EL1 */ + __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); + __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); + + __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); + __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); + __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); + __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); + __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); + __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); + __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); + __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); + + /* + * In VHE mode those registers are compatible between EL1 and EL2, + * and the guest uses the _EL1 versions on the CPU naturally. + * So we save them into their _EL2 versions here. + * For nVHE mode we trap accesses to those registers, so our + * _EL2 copy in sys_regs[] is always up-to-date and we don't need + * to save anything here. + */ + if (vcpu_el2_e2h_is_set(vcpu)) { + u64 val; + + /* + * We don't save CPTR_EL2, as accesses to CPACR_EL1 + * are always trapped, ensuring that the in-memory + * copy is always up-to-date. A small blessing... + */ + __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); + __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); + __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); + __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); + + /* + * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where + * the interesting CNTHCTL_EL2 bits live. So preserve these + * bits when reading back the guest-visible value. + */ + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; + __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; + } + + __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); + __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); + __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); +} + +static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* These registers are common with EL1 */ + write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); + write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); + + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + + if (vcpu_el2_e2h_is_set(vcpu)) { + /* + * In VHE mode those registers are compatible between + * EL1 and EL2. + */ + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); + } else { + /* + * CNTHCTL_EL2 only affects EL1 when running nVHE, so + * no need to restore it. + */ + val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); + write_sysreg_el1(val, SYS_SCTLR); + val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); + write_sysreg_el1(val, SYS_CPACR); + val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); + write_sysreg_el1(val, SYS_TTBR0); + val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); + write_sysreg_el1(val, SYS_TCR); + } + + write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); + write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); +} + /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and * pstate, which are handled as part of the el2 return state) on every @@ -81,6 +182,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; + u64 mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); @@ -105,7 +207,29 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); __mpam_guest_load(); - __sysreg_restore_el1_state(guest_ctxt); + + if (unlikely(is_hyp_ctxt(vcpu))) { + __sysreg_restore_vel2_state(vcpu); + } else { + if (vcpu_has_nv(vcpu)) { + /* + * Use the guest hypervisor's VPIDR_EL2 when in a + * nested state. The hardware value of MIDR_EL1 gets + * restored on put. + */ + write_sysreg(ctxt_sys_reg(guest_ctxt, VPIDR_EL2), vpidr_el2); + + /* + * As we're restoring a nested guest, set the value + * provided by the guest hypervisor. + */ + mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); + } else { + mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); + } + + __sysreg_restore_el1_state(guest_ctxt, mpidr); + } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); } @@ -128,12 +252,20 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); - __sysreg_save_el1_state(guest_ctxt); + if (unlikely(is_hyp_ctxt(vcpu))) + __sysreg_save_vel2_state(vcpu); + else + __sysreg_save_el1_state(guest_ctxt); + __sysreg_save_user_state(guest_ctxt); __sysreg32_save_state(vcpu); /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); + /* If leaving a nesting guest, restore MIDR_EL1 default view */ + if (vcpu_has_nv(vcpu)) + write_sysreg(read_cpuid_id(), vpidr_el2); + vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 1cd02da793b86d5cf13d730319de48f707148f10..87e54300caeef02d905a75ed84b79b3ea9a9e796 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -17,8 +17,8 @@ struct tlb_inv_context { u64 sctlr; }; -static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt) +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); u64 val; @@ -67,7 +67,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, isb(); } -static void __tlb_switch_to_host(struct tlb_inv_context *cxt) +static void exit_vmid_context(struct tlb_inv_context *cxt) { /* * We're done with the TLB operation, let's restore the host's @@ -97,7 +97,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. @@ -118,7 +118,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, @@ -129,7 +129,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, dsb(nshst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. @@ -150,7 +150,7 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, @@ -169,7 +169,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, TLBI_TTL_UNKNOWN); @@ -179,7 +179,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) @@ -189,13 +189,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) @@ -203,14 +203,14 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) @@ -232,3 +232,150 @@ void __kvm_flush_vm_context(void) dsb(ish); } + +/* + * TLB invalidation emulation for NV. For any given instruction, we + * perform the following transformtions: + * + * - a TLBI targeting EL2 S1 is remapped to EL1 S1 + * - a non-shareable TLBI is upgraded to being inner-shareable + * - an outer-shareable TLBI is also mapped to inner-shareable + * - an nXS TLBI is upgraded to XS + */ +int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) +{ + struct tlb_inv_context cxt; + int ret = 0; + + /* + * The guest will have provided its own DSB ISHST before trapping. + * If it hasn't, that's its own problem, and we won't paper over it + * (plus, there is plenty of extra synchronisation before we even + * get here...). + */ + + if (mmu) + enter_vmid_context(mmu, &cxt); + + switch (sys_encoding) { + case OP_TLBI_ALLE2: + case OP_TLBI_ALLE2IS: + case OP_TLBI_ALLE2OS: + case OP_TLBI_VMALLE1: + case OP_TLBI_VMALLE1IS: + case OP_TLBI_VMALLE1OS: + case OP_TLBI_ALLE2NXS: + case OP_TLBI_ALLE2ISNXS: + case OP_TLBI_ALLE2OSNXS: + case OP_TLBI_VMALLE1NXS: + case OP_TLBI_VMALLE1ISNXS: + case OP_TLBI_VMALLE1OSNXS: + __tlbi(vmalle1is); + break; + case OP_TLBI_VAE2: + case OP_TLBI_VAE2IS: + case OP_TLBI_VAE2OS: + case OP_TLBI_VAE1: + case OP_TLBI_VAE1IS: + case OP_TLBI_VAE1OS: + case OP_TLBI_VAE2NXS: + case OP_TLBI_VAE2ISNXS: + case OP_TLBI_VAE2OSNXS: + case OP_TLBI_VAE1NXS: + case OP_TLBI_VAE1ISNXS: + case OP_TLBI_VAE1OSNXS: + __tlbi(vae1is, va); + break; + case OP_TLBI_VALE2: + case OP_TLBI_VALE2IS: + case OP_TLBI_VALE2OS: + case OP_TLBI_VALE1: + case OP_TLBI_VALE1IS: + case OP_TLBI_VALE1OS: + case OP_TLBI_VALE2NXS: + case OP_TLBI_VALE2ISNXS: + case OP_TLBI_VALE2OSNXS: + case OP_TLBI_VALE1NXS: + case OP_TLBI_VALE1ISNXS: + case OP_TLBI_VALE1OSNXS: + __tlbi(vale1is, va); + break; + case OP_TLBI_ASIDE1: + case OP_TLBI_ASIDE1IS: + case OP_TLBI_ASIDE1OS: + case OP_TLBI_ASIDE1NXS: + case OP_TLBI_ASIDE1ISNXS: + case OP_TLBI_ASIDE1OSNXS: + __tlbi(aside1is, va); + break; + case OP_TLBI_VAAE1: + case OP_TLBI_VAAE1IS: + case OP_TLBI_VAAE1OS: + case OP_TLBI_VAAE1NXS: + case OP_TLBI_VAAE1ISNXS: + case OP_TLBI_VAAE1OSNXS: + __tlbi(vaae1is, va); + break; + case OP_TLBI_VAALE1: + case OP_TLBI_VAALE1IS: + case OP_TLBI_VAALE1OS: + case OP_TLBI_VAALE1NXS: + case OP_TLBI_VAALE1ISNXS: + case OP_TLBI_VAALE1OSNXS: + __tlbi(vaale1is, va); + break; + case OP_TLBI_RVAE2: + case OP_TLBI_RVAE2IS: + case OP_TLBI_RVAE2OS: + case OP_TLBI_RVAE1: + case OP_TLBI_RVAE1IS: + case OP_TLBI_RVAE1OS: + case OP_TLBI_RVAE2NXS: + case OP_TLBI_RVAE2ISNXS: + case OP_TLBI_RVAE2OSNXS: + case OP_TLBI_RVAE1NXS: + case OP_TLBI_RVAE1ISNXS: + case OP_TLBI_RVAE1OSNXS: + __tlbi(rvae1is, va); + break; + case OP_TLBI_RVALE2: + case OP_TLBI_RVALE2IS: + case OP_TLBI_RVALE2OS: + case OP_TLBI_RVALE1: + case OP_TLBI_RVALE1IS: + case OP_TLBI_RVALE1OS: + case OP_TLBI_RVALE2NXS: + case OP_TLBI_RVALE2ISNXS: + case OP_TLBI_RVALE2OSNXS: + case OP_TLBI_RVALE1NXS: + case OP_TLBI_RVALE1ISNXS: + case OP_TLBI_RVALE1OSNXS: + __tlbi(rvale1is, va); + break; + case OP_TLBI_RVAAE1: + case OP_TLBI_RVAAE1IS: + case OP_TLBI_RVAAE1OS: + case OP_TLBI_RVAAE1NXS: + case OP_TLBI_RVAAE1ISNXS: + case OP_TLBI_RVAAE1OSNXS: + __tlbi(rvaae1is, va); + break; + case OP_TLBI_RVAALE1: + case OP_TLBI_RVAALE1IS: + case OP_TLBI_RVAALE1OS: + case OP_TLBI_RVAALE1NXS: + case OP_TLBI_RVAALE1ISNXS: + case OP_TLBI_RVAALE1OSNXS: + __tlbi(rvaale1is, va); + break; + default: + ret = -EINVAL; + } + dsb(ish); + isb(); + + if (mmu) + exit_vmid_context(&cxt); + + return ret; +} diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index cbf6c554acb9e767ee2d43d69effb8ff30a80238..2d849cf777478ff6a7153c3fe468704137bf3e98 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -605,7 +605,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { bool wants_02; - wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); + wants_02 = vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2); switch (val) { case KVM_ARM_PSCI_0_1: diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 2aa503ff742ee57e5023453e7d143d93fe909121..130a77da0c2fa8c153a9bee32d51515da11d89ec 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -161,8 +161,16 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) /* * No valid syndrome? Ask userspace for help if it has * volunteered to do so, and bail out otherwise. + * + * In the protected VM case, there isn't much userspace can do + * though, so directly deliver an exception to the guest. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { + if (vcpu_is_protected(vcpu)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ce9fea4d61b3a225c1fd6056da8272d0f6cbc775..3a31d29e6dbd6a45cf0ba8555766dfe4a5cfa282 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -220,12 +220,12 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) { struct page *page = container_of(head, struct page, rcu_head); void *pgtable = page_to_virt(page); - u32 level = page_private(page); + s8 level = page_private(page); kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); } -static void stage2_free_unlinked_table(void *addr, u32 level) +static void stage2_free_unlinked_table(void *addr, s8 level) { struct page *page = virt_to_page(addr); @@ -327,9 +327,15 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 may_block)); } -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block) { - __unmap_stage2_range(mmu, start, size, true); + __unmap_stage2_range(mmu, start, size, may_block); +} + +void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +{ + stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_flush); } static void stage2_flush_memslot(struct kvm *kvm, @@ -338,7 +344,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush); + kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); } /** @@ -361,6 +367,8 @@ static void stage2_flush_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_flush_memslot(kvm, memslot); + kvm_nested_s2_flush(kvm); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } @@ -803,13 +811,13 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) struct kvm_pgtable pgt = { .pgd = (kvm_pteref_t)kvm->mm->pgd, .ia_bits = vabits_actual, - .start_level = (KVM_PGTABLE_MAX_LEVELS - - CONFIG_PGTABLE_LEVELS), + .start_level = (KVM_PGTABLE_LAST_LEVEL - + CONFIG_PGTABLE_LEVELS + 1), .mm_ops = &kvm_user_mm_ops, }; unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... */ - u32 level = ~0; + s8 level = S8_MAX; int ret; /* @@ -828,7 +836,9 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) * Not seeing an error, but not updating level? Something went * deeply wrong... */ - if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS)) + if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) + return -EFAULT; + if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) return -EFAULT; /* Oops, the userspace PTs are gone... Replay the fault */ @@ -852,21 +862,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .icache_inval_pou = invalidate_icache_guest_page, }; -/** - * kvm_init_stage2_mmu - Initialise a S2 MMU structure - * @kvm: The pointer to the KVM structure - * @mmu: The pointer to the s2 MMU structure - * @type: The machine type of the virtual machine - * - * Allocates only the stage-2 HW PGD level table(s). - * Note we don't need locking here as this is only called when the VM is - * created, which can only be done once. - */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) { u32 kvm_ipa_limit = get_kvm_ipa_limit(); - int cpu, err; - struct kvm_pgtable *pgt; u64 mmfr0, mmfr1; u32 phys_shift; @@ -893,11 +891,51 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + return 0; +} + +/** + * kvm_init_stage2_mmu - Initialise a S2 MMU structure + * @kvm: The pointer to the KVM structure + * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine + * + * Allocates only the stage-2 HW PGD level table(s). + * Note we don't need locking here as this is only called in two cases: + * + * - when the VM is created, which can't race against anything + * + * - when secondary kvm_s2_mmu structures are initialised for NV + * guests, and the caller must hold kvm->lock as this is called on a + * per-vcpu basis. + */ +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +{ + int cpu, err; + struct kvm_pgtable *pgt; + + /* + * If we already have our page tables in place, and that the + * MMU context is the canonical one, we have a bug somewhere, + * as this is only supposed to ever happen once per VM. + * + * Otherwise, we're building nested page tables, and that's + * probably because userspace called KVM_ARM_VCPU_INIT more + * than once on the same vcpu. Since that's actually legal, + * don't kick a fuss and leave gracefully. + */ if (mmu->pgt != NULL) { + if (kvm_is_nested_s2_mmu(kvm, mmu)) + return 0; + kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } + err = kvm_init_ipa_range(mmu, type); + if (err) + return err; + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; @@ -922,6 +960,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + return 0; out_destroy_pgtable: @@ -973,7 +1015,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); } hva = vm_end; } while (hva < reg_end); @@ -1000,6 +1042,8 @@ void stage2_unmap_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_unmap_memslot(kvm, memslot); + kvm_nested_s2_unmap(kvm, true); + write_unlock(&kvm->mmu_lock); mmap_read_unlock(current->mm); srcu_read_unlock(&kvm->srcu, idx); @@ -1017,6 +1061,10 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + write_unlock(&kvm->mmu_lock); if (pgt) { @@ -1099,12 +1147,12 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, } /** - * stage2_wp_range() - write protect stage2 memory region range + * kvm_stage2_wp_range() - write protect stage2 memory region range * @mmu: The KVM stage-2 MMU pointer * @addr: Start address of range * @end: End address of range */ -static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect); } @@ -1135,7 +1183,8 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_nested_s2_wp(kvm); write_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs_memslot(kvm, memslot); } @@ -1189,7 +1238,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, lockdep_assert_held_write(&kvm->mmu_lock); - stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); /* * Eager-splitting is done when manual-protect is set. We @@ -1201,6 +1250,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, */ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) kvm_mmu_split_huge_pages(kvm, start, end); + + kvm_nested_s2_wp(kvm); } static void kvm_send_hwpoison_signal(unsigned long address, short lsb) @@ -1404,14 +1455,16 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, - unsigned long fault_status) + bool fault_is_perm) { int ret = 0; bool write_fault, writable, force_pte = false; bool exec_fault, mte_allowed, is_vma_cacheable; bool s2_force_noncacheable = false, vfio_allow_any_uc = false; unsigned long mmu_seq; + phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; @@ -1419,18 +1472,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; vm_flags_t vm_flags; - fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level); + if (fault_is_perm) + fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); - if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) { + if (fault_is_perm && !write_fault && !exec_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } @@ -1441,8 +1494,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * only exception to this is when dirty logging is enabled at runtime * and a write fault needs to collapse a block entry into a table. */ - if (fault_status != ESR_ELx_FSC_PERM || - (logging_active && write_fault)) { + if (!fault_is_perm || (logging_active && write_fault)) { ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu)); if (ret) @@ -1497,10 +1549,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } vma_pagesize = 1UL << vma_shift; - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + + if (nested) { + unsigned long max_map_size; + + max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; + + ipa = kvm_s2_trans_output(nested); + + /* + * If we're about to create a shadow stage 2 entry, then we + * can only create a block mapping if the guest stage 2 page + * table uses at least as big a mapping. + */ + max_map_size = min(kvm_s2_trans_size(nested), max_map_size); + + /* + * Be careful that if the mapping size falls between + * two host sizes, take the smallest of the two. + */ + if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) + max_map_size = PMD_SIZE; + else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) + max_map_size = PAGE_SIZE; + + force_pte = (max_map_size == PAGE_SIZE); + vma_pagesize = min(vma_pagesize, (long)max_map_size); + } + + /* + * Both the canonical IPA and fault IPA must be hugepage-aligned to + * ensure we find the right PFN and lay down the mapping in the right + * place. + */ + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { fault_ipa &= ~(vma_pagesize - 1); + ipa &= ~(vma_pagesize - 1); + } - gfn = fault_ipa >> PAGE_SHIFT; + gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; @@ -1576,18 +1663,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && s2_force_noncacheable) return -ENOEXEC; + /* + * Potentially reduce shadow S2 permissions to match the guest's own + * S2. For exec faults, we'd only reach this point if the guest + * actually allowed it (see kvm_s2_handle_perm_fault). + * + * Also encode the level of the original translation in the SW bits + * of the leaf entry as a proxy for the span of that translation. + * This will be retrieved on TLB invalidation from the guest and + * used to limit the invalidation scope if a TTL hint or a range + * isn't provided. + */ + if (nested) { + writable &= kvm_s2_trans_writable(nested); + if (!kvm_s2_trans_readable(nested)) + prot &= ~KVM_PGTABLE_PROT_R; + + prot |= kvm_encode_nested_level(nested); + } + read_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; - if (mmu_invalidate_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; goto out_unlock; + } /* * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. */ if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { - if (fault_status == ESR_ELx_FSC_PERM && - fault_granule > PAGE_SIZE) + if (fault_is_perm && fault_granule > PAGE_SIZE) vma_pagesize = fault_granule; else vma_pagesize = transparent_hugepage_adjust(kvm, memslot, @@ -1600,7 +1707,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } } - if (fault_status != ESR_ELx_FSC_PERM && !s2_force_noncacheable && kvm_has_mte(kvm)) { + if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); @@ -1621,7 +1728,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= KVM_PGTABLE_PROT_NORMAL_NC; else prot |= KVM_PGTABLE_PROT_DEVICE; - } else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) { + } else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC) && + (!nested || kvm_s2_trans_executable(nested))) { prot |= KVM_PGTABLE_PROT_X; } @@ -1630,23 +1738,29 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) + if (fault_is_perm && vma_pagesize == fault_granule) { + /* + * Drop the SW bits in favour of those stored in the + * PTE, which will be preserved. + */ + prot &= ~KVM_NV_GUEST_MAP_SZ; ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); - else + } else { ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, memcache, KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED); + } +out_unlock: + read_unlock(&kvm->mmu_lock); /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { kvm_set_pfn_dirty(pfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } -out_unlock: - read_unlock(&kvm->mmu_lock); kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; } @@ -1681,20 +1795,22 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) */ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { - unsigned long fault_status; - phys_addr_t fault_ipa; + struct kvm_s2_trans nested_trans, *nested = NULL; + unsigned long esr; + phys_addr_t fault_ipa; /* The address we faulted on */ + phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ struct kvm_memory_slot *memslot; unsigned long hva; bool is_iabt, write_fault, writable; gfn_t gfn; int ret, idx; - fault_status = kvm_vcpu_trap_get_fault_type(vcpu); + esr = kvm_vcpu_get_esr(vcpu); - fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (fault_status == ESR_ELx_FSC_FAULT) { + if (esr_fsc_is_translation_fault(esr)) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); @@ -1729,9 +1845,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ - if (fault_status != ESR_ELx_FSC_FAULT && - fault_status != ESR_ELx_FSC_PERM && - fault_status != ESR_ELx_FSC_ACCESS) { + if (!esr_fsc_is_translation_fault(esr) && + !esr_fsc_is_permission_fault(esr) && + !esr_fsc_is_access_flag_fault(esr)) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), @@ -1741,7 +1857,42 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) idx = srcu_read_lock(&vcpu->kvm->srcu); - gfn = fault_ipa >> PAGE_SHIFT; + /* + * We may have faulted on a shadow stage 2 page table if we are + * running a nested guest. In this case, we have to resolve the L2 + * IPA to the L1 IPA first, before knowing what kind of memory should + * back the L1 IPA. + * + * If the shadow stage 2 page table walk faults, then we simply inject + * this to the guest and carry on. + * + * If there are no shadow S2 PTs because S2 is disabled, there is + * nothing to walk and we treat it as a 1:1 before going through the + * canonical translation. + */ + if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && + vcpu->arch.hw_mmu->nested_stage2_enabled) { + u32 esr; + + ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ipa = kvm_s2_trans_output(&nested_trans); + nested = &nested_trans; + } + + gfn = ipa >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); write_fault = kvm_is_write_fault(vcpu); @@ -1785,21 +1936,22 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * faulting VA. This is always 12 bits, irrespective * of the page size. */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, fault_ipa); + ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + ret = io_mem_abort(vcpu, ipa); goto out_unlock; } /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); + VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); - if (fault_status == ESR_ELx_FSC_ACCESS) { + if (esr_fsc_is_access_flag_fault(esr)) { handle_access_fault(vcpu, fault_ipa); ret = 1; goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, + esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; out: @@ -1821,6 +1973,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) (range->end - range->start) << PAGE_SHIFT, range->may_block); + kvm_nested_s2_unmap(kvm, range->may_block); return false; } @@ -1868,6 +2021,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, true); + /* + * TODO: Handle nested_mmu structures here using the reverse mapping in + * a later version of patch series. + */ } bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -2128,11 +2285,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_uninit_stage2_mmu(kvm); -} - void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { @@ -2140,7 +2292,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - unmap_stage2_range(&kvm->arch.mmu, gpa, size); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); + kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index ced30c90521a02713a4e0e06c1d7b1430df55ea6..fa98e065f1ec9a49cb71b2f48d5278dd4ddb7724 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -4,18 +4,819 @@ * Author: Jintack Lim */ +#include #include #include +#include #include +#include #include #include #include "sys_regs.h" -/* Protection against the sysreg repainting madness... */ -#define NV_FTR(r, f) ID_AA64##r##_EL1_##f +/* + * Ratio of live shadow S2 MMU per vcpu. This is a trade-off between + * memory usage and potential number of different sets of S2 PTs in + * the guests. Running out of S2 MMUs only affects performance (we + * will invalidate them more often). + */ +#define S2_MMU_PER_VCPU 2 + +void kvm_init_nested(struct kvm *kvm) +{ + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; +} + +static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +{ + /* + * We only initialise the IPA range on the canonical MMU, which + * defines the contract between KVM and userspace on where the + * "hardware" is in the IPA space. This affects the validity of MMIO + * exits forwarded to userspace, for example. + * + * For nested S2s, we use the PARange as exposed to the guest, as it + * is allowed to use it at will to expose whatever memory map it + * wants to its own guests as it would be on real HW. + */ + return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm)); +} + +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *tmp; + int num_mmus, ret = 0; + + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) && + !cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + return -EINVAL; + + /* + * Let's treat memory allocation failures as benign: If we fail to + * allocate anything, return an error and keep the allocated array + * alive. Userspace may try to recover by intializing the vcpu + * again, and there is no reason to affect the whole VM for this. + */ + num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; + tmp = kvrealloc(kvm->arch.nested_mmus, + size_mul(sizeof(*kvm->arch.nested_mmus), kvm->arch.nested_mmus_size), + size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!tmp) + return -ENOMEM; + + swap(kvm->arch.nested_mmus, tmp); + + /* + * If we went through a realocation, adjust the MMU back-pointers in + * the previously initialised kvm_pgtable structures. + */ + if (kvm->arch.nested_mmus != tmp) + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) + kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; + + for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) + ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); + + if (ret) { + for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) + kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); + + return ret; + } + + kvm->arch.nested_mmus_size = num_mmus; + + return 0; +} + +struct s2_walk_info { + int (*read_desc)(phys_addr_t pa, u64 *desc, void *data); + void *data; + u64 baddr; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int sl; + unsigned int t0sz; + bool be; +}; + +static u32 compute_fsc(int level, u32 fsc) +{ + return fsc | (level & 0x3); +} + +static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc) +{ + u32 esr; + + esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC; + esr |= compute_fsc(level, fsc); + return esr; +} + +static int get_ia_size(struct s2_walk_info *wi) +{ + return 64 - wi->t0sz; +} + +static int check_base_s2_limits(struct s2_walk_info *wi, + int level, int input_size, int stride) +{ + int start_size, ia_size; + + ia_size = get_ia_size(wi); + + /* Check translation limits */ + switch (BIT(wi->pgshift)) { + case SZ_64K: + if (level == 0 || (level == 1 && ia_size <= 42)) + return -EFAULT; + break; + case SZ_16K: + if (level == 0 || (level == 1 && ia_size <= 40)) + return -EFAULT; + break; + case SZ_4K: + if (level < 0 || (level == 0 && ia_size <= 42)) + return -EFAULT; + break; + } + + /* Check input size limits */ + if (input_size > ia_size) + return -EFAULT; + + /* Check number of entries in starting level table */ + start_size = input_size - ((3 - level) * stride + wi->pgshift); + if (start_size < 1 || start_size > stride + 4) + return -EFAULT; + + return 0; +} + +/* Check if output is within boundaries */ +static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) +{ + unsigned int output_size = wi->max_oa_bits; + + if (output_size != 48 && (output & GENMASK_ULL(47, output_size))) + return -1; + + return 0; +} + +/* + * This is essentially a C-version of the pseudo code from the ARM ARM + * AArch64.TranslationTableWalk function. I strongly recommend looking at + * that pseudocode in trying to understand this. + * + * Must be called with the kvm->srcu read lock held + */ +static int walk_nested_s2_pgd(phys_addr_t ipa, + struct s2_walk_info *wi, struct kvm_s2_trans *out) +{ + int first_block_level, level, stride, input_size, base_lower_bound; + phys_addr_t base_addr; + unsigned int addr_top, addr_bottom; + u64 desc; /* page table entry */ + int ret; + phys_addr_t paddr; + + switch (BIT(wi->pgshift)) { + default: + case SZ_64K: + case SZ_16K: + level = 3 - wi->sl; + first_block_level = 2; + break; + case SZ_4K: + level = 2 - wi->sl; + first_block_level = 1; + break; + } + + stride = wi->pgshift - 3; + input_size = get_ia_size(wi); + if (input_size > 48 || input_size < 25) + return -EFAULT; + + ret = check_base_s2_limits(wi, level, input_size, stride); + if (WARN_ON(ret)) + return ret; + + base_lower_bound = 3 + input_size - ((3 - level) * stride + + wi->pgshift); + base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound); + + if (check_output_size(wi, base_addr)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + return 1; + } + + addr_top = input_size - 1; + + while (1) { + phys_addr_t index; + + addr_bottom = (3 - level) * stride + wi->pgshift; + index = (ipa & GENMASK_ULL(addr_top, addr_bottom)) + >> (addr_bottom - 3); + + paddr = base_addr | index; + ret = wi->read_desc(paddr, &desc, wi->data); + if (ret < 0) + return ret; + + /* + * Handle reversedescriptors if endianness differs between the + * host and the guest hypervisor. + */ + if (wi->be) + desc = be64_to_cpu((__force __be64)desc); + else + desc = le64_to_cpu((__force __le64)desc); + + /* Check for valid descriptor at this point */ + if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + /* We're at the final level or block translation level */ + if ((desc & 3) == 1 || level == 3) + break; + + if (check_output_size(wi, desc)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + out->desc = desc; + return 1; + } + + base_addr = desc & GENMASK_ULL(47, wi->pgshift); + + level += 1; + addr_top = addr_bottom - 1; + } + + if (level < first_block_level) { + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + if (check_output_size(wi, desc)) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); + out->desc = desc; + return 1; + } + + if (!(desc & BIT(10))) { + out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); + out->desc = desc; + return 1; + } + + addr_bottom += contiguous_bit_shift(desc, wi, level); + + /* Calculate and return the result */ + paddr = (desc & GENMASK_ULL(47, addr_bottom)) | + (ipa & GENMASK_ULL(addr_bottom - 1, 0)); + out->output = paddr; + out->block_size = 1UL << ((3 - level) * stride + wi->pgshift); + out->readable = desc & (0b01 << 6); + out->writable = desc & (0b10 << 6); + out->level = level; + out->desc = desc; + return 0; +} + +static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) +{ + struct kvm_vcpu *vcpu = data; + + return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); +} + +static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) +{ + wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; + + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + wi->pgshift = 12; break; + case VTCR_EL2_TG0_16K: + wi->pgshift = 14; break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + + wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + /* Global limit for now, should eventually be per-VM */ + wi->max_oa_bits = min(get_kvm_ipa_limit(), + ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr))); +} + +int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, + struct kvm_s2_trans *result) +{ + u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + struct s2_walk_info wi; + int ret; + + result->esr = 0; + + if (!vcpu_has_nv(vcpu)) + return 0; + + wi.read_desc = read_guest_s2_desc; + wi.data = vcpu; + wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + vtcr_to_walk_info(vtcr, &wi); + + wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; + + ret = walk_nested_s2_pgd(gipa, &wi, result); + if (ret) + result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC); + + return ret; +} + +static unsigned int ttl_to_size(u8 ttl) +{ + int level = ttl & 3; + int gran = (ttl >> 2) & 3; + unsigned int max_size = 0; + + switch (gran) { + case TLBI_TTL_TG_4K: + switch (level) { + case 0: + break; + case 1: + max_size = SZ_1G; + break; + case 2: + max_size = SZ_2M; + break; + case 3: + max_size = SZ_4K; + break; + } + break; + case TLBI_TTL_TG_16K: + switch (level) { + case 0: + case 1: + break; + case 2: + max_size = SZ_32M; + break; + case 3: + max_size = SZ_16K; + break; + } + break; + case TLBI_TTL_TG_64K: + switch (level) { + case 0: + case 1: + /* No 52bit IPA support */ + break; + case 2: + max_size = SZ_512M; + break; + case 3: + max_size = SZ_64K; + break; + } + break; + default: /* No size information */ + break; + } + + return max_size; +} +/* + * Compute the equivalent of the TTL field by parsing the shadow PT. The + * granule size is extracted from the cached VTCR_EL2.TG0 while the level is + * retrieved from first entry carrying the level as a tag. + */ +static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) +{ + u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr; + kvm_pte_t pte; + u8 ttl, level; + + lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock); + + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + ttl = (TLBI_TTL_TG_4K << 2); + break; + case VTCR_EL2_TG0_16K: + ttl = (TLBI_TTL_TG_16K << 2); + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + ttl = (TLBI_TTL_TG_64K << 2); + break; + } + + tmp = addr; + +again: + /* Iteratively compute the block sizes for a particular granule size */ + switch (vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + if (sz < SZ_4K) sz = SZ_4K; + else if (sz < SZ_2M) sz = SZ_2M; + else if (sz < SZ_1G) sz = SZ_1G; + else sz = 0; + break; + case VTCR_EL2_TG0_16K: + if (sz < SZ_16K) sz = SZ_16K; + else if (sz < SZ_32M) sz = SZ_32M; + else sz = 0; + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (sz < SZ_64K) sz = SZ_64K; + else if (sz < SZ_512M) sz = SZ_512M; + else sz = 0; + break; + } + + if (sz == 0) + return 0; + + tmp &= ~(sz - 1); + if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL)) + goto again; + if (!(pte & PTE_VALID)) + goto again; + level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte); + if (!level) + goto again; + + ttl |= level; + + /* + * We now have found some level information in the shadow S2. Check + * that the resulting range is actually including the original IPA. + */ + sz = ttl_to_size(ttl); + if (addr < (tmp + sz)) + return ttl; + + return 0; +} + +unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); + unsigned long max_size; + u8 ttl; + + ttl = FIELD_GET(TLBI_TTL_MASK, val); + + if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) { + /* No TTL, check the shadow S2 for a hint */ + u64 addr = (val & GENMASK_ULL(35, 0)) << 12; + ttl = get_guest_mapping_ttl(mmu, addr); + } + + max_size = ttl_to_size(ttl); + + if (!max_size) { + /* Compute the maximum extent of the invalidation */ + switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) { + case VTCR_EL2_TG0_4K: + max_size = SZ_1G; + break; + case VTCR_EL2_TG0_16K: + max_size = SZ_32M; + break; + case VTCR_EL2_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + /* + * No, we do not support 52bit IPA in nested yet. Once + * we do, this should be 4TB. + */ + max_size = SZ_512M; + break; + } + } + + WARN_ON(!max_size); + return max_size; +} + +/* + * We can have multiple *different* MMU contexts with the same VMID: + * + * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit + * + * - Multiple vcpus using private S2s (huh huh...), hence differing by the + * VBBTR_EL2.BADDR address + * + * - A combination of the above... + * + * We can always identify which MMU context to pick at run-time. However, + * TLB invalidation involving a VMID must take action on all the TLBs using + * this particular VMID. This translates into applying the same invalidation + * operation to all the contexts that are using this VMID. Moar phun! + */ +void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, + const union tlbi_info *info, + void (*tlbi_callback)(struct kvm_s2_mmu *, + const union tlbi_info *)) +{ + write_lock(&kvm->mmu_lock); + + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (vmid == get_vmid(mmu->tlb_vttbr)) + tlbi_callback(mmu, info); + } + + write_unlock(&kvm->mmu_lock); +} + +struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + bool nested_stage2_enabled; + u64 vttbr, vtcr, hcr; + + lockdep_assert_held_write(&kvm->mmu_lock); + + vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + hcr = vcpu_read_sys_reg(vcpu, HCR_EL2); + + nested_stage2_enabled = hcr & HCR_VM; + + /* Don't consider the CnP bit for the vttbr match */ + vttbr &= ~VTTBR_CNP_BIT; + + /* + * Two possibilities when looking up a S2 MMU context: + * + * - either S2 is enabled in the guest, and we need a context that is + * S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR, + * which makes it safe from a TLB conflict perspective (a broken + * guest won't be able to generate them), + * + * - or S2 is disabled, and we need a context that is S2-disabled + * and matches the VMID only, as all TLBs are tagged by VMID even + * if S2 translation is disabled. + */ + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (nested_stage2_enabled && + mmu->nested_stage2_enabled && + vttbr == mmu->tlb_vttbr && + vtcr == mmu->tlb_vtcr) + return mmu; + + if (!nested_stage2_enabled && + !mmu->nested_stage2_enabled && + get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr)) + return mmu; + } + return NULL; +} + +static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *s2_mmu; + int i; + + lockdep_assert_held_write(&vcpu->kvm->mmu_lock); + + s2_mmu = lookup_s2_mmu(vcpu); + if (s2_mmu) + goto out; + + /* + * Make sure we don't always search from the same point, or we + * will always reuse a potentially active context, leaving + * free contexts unused. + */ + for (i = kvm->arch.nested_mmus_next; + i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next); + i++) { + s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size]; + + if (atomic_read(&s2_mmu->refcnt) == 0) + break; + } + BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */ + + /* Set the scene for the next search */ + kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; + + /* Make sure we don't forget to do the laundry */ + if (kvm_s2_mmu_valid(s2_mmu)) + s2_mmu->pending_unmap = true; + + /* + * The virtual VMID (modulo CnP) will be used as a key when matching + * an existing kvm_s2_mmu. + * + * We cache VTCR at allocation time, once and for all. It'd be great + * if the guest didn't screw that one up, as this is not very + * forgiving... + */ + s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT; + s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM; + +out: + atomic_inc(&s2_mmu->refcnt); + + /* + * Set the vCPU request to perform an unmap, even if the pending unmap + * originates from another vCPU. This guarantees that the MMU has been + * completely unmapped before any vCPU actually uses it, and allows + * multiple vCPUs to lend a hand with completing the unmap. + */ + if (s2_mmu->pending_unmap) + kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu); + + return s2_mmu; +} + +void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) +{ + /* CnP being set denotes an invalid entry */ + mmu->tlb_vttbr = VTTBR_CNP_BIT; + mmu->nested_stage2_enabled = false; + atomic_set(&mmu->refcnt, 0); +} + +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) +{ + /* + * The vCPU kept its reference on the MMU after the last put, keep + * rolling with it. + */ + if (vcpu->arch.hw_mmu) + return; + + if (is_hyp_ctxt(vcpu)) { + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + } else { + write_lock(&vcpu->kvm->mmu_lock); + vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); + write_unlock(&vcpu->kvm->mmu_lock); + } +} + +void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) +{ + /* + * Keep a reference on the associated stage-2 MMU if the vCPU is + * scheduling out and not in WFI emulation, suggesting it is likely to + * reuse the MMU sometime soon. + */ + if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI)) + return; + + if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) + atomic_dec(&vcpu->arch.hw_mmu->refcnt); + + vcpu->arch.hw_mmu = NULL; +} + +/* + * Returns non-zero if permission fault is handled by injecting it to the next + * level hypervisor. + */ +int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans) +{ + bool forward_fault = false; + + trans->esr = 0; + + if (!kvm_vcpu_trap_is_permission_fault(vcpu)) + return 0; + + if (kvm_vcpu_trap_is_iabt(vcpu)) { + forward_fault = !kvm_s2_trans_executable(trans); + } else { + bool write_fault = kvm_is_write_fault(vcpu); + + forward_fault = ((write_fault && !trans->writable) || + (!write_fault && !trans->readable)); + } + + if (forward_fault) + trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM); + + return forward_fault; +} + +int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) +{ + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2); + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2); + + return kvm_inject_nested_sync(vcpu, esr_el2); +} + +void kvm_nested_s2_wp(struct kvm *kvm) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu)); + } +} + +void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); + } +} + +void kvm_nested_s2_flush(struct kvm *kvm) +{ + int i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (kvm_s2_mmu_valid(mmu)) + kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu)); + } +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!WARN_ON(atomic_read(&mmu->refcnt))) + kvm_free_stage2_pgd(mmu); + } + kvfree(kvm->arch.nested_mmus); + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; + kvm_uninit_stage2_mmu(kvm); +} + +#define has_tgran_2(__r, __sz) \ + ({ \ + u64 _s1, _s2, _mmfr0 = __r; \ + \ + _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz##_2, _mmfr0); \ + \ + _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz, _mmfr0); \ + \ + ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \ + _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \ + (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \ + _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \ + }) /* * Our emulated CPU doesn't support all the possible features. For the * sake of simplicity (and probably mental sanity), wipe out a number @@ -23,156 +824,167 @@ * This list should get updated as new features get added to the NV * support, and new extension to the architecture. */ -static u64 limit_nv_id_reg(u32 id, u64 val) +u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) { - u64 tmp; + u64 orig_val = val; - switch (id) { + switch (reg) { case SYS_ID_AA64ISAR0_EL1: - /* Support everything but TME, O.S. and Range TLBIs */ - val &= ~(NV_FTR(ISAR0, TLB) | - NV_FTR(ISAR0, TME)); + /* Support everything but TME */ + val &= ~ID_AA64ISAR0_EL1_TME; break; case SYS_ID_AA64ISAR1_EL1: - /* Support everything but PtrAuth and Spec Invalidation */ - val &= ~(GENMASK_ULL(63, 56) | - NV_FTR(ISAR1, SPECRES) | - NV_FTR(ISAR1, GPI) | - NV_FTR(ISAR1, GPA) | - NV_FTR(ISAR1, API) | - NV_FTR(ISAR1, APA)); + /* Support everything but LS64 and Spec Invalidation */ + val &= ~(ID_AA64ISAR1_EL1_LS64 | + ID_AA64ISAR1_EL1_SPECRES); break; case SYS_ID_AA64PFR0_EL1: - /* No AMU, MPAM, S-EL2, RAS or SVE */ - val &= ~(GENMASK_ULL(55, 52) | - NV_FTR(PFR0, AMU) | - NV_FTR(PFR0, MPAM) | - NV_FTR(PFR0, SEL2) | - NV_FTR(PFR0, RAS) | - NV_FTR(PFR0, SVE) | - NV_FTR(PFR0, EL3) | - NV_FTR(PFR0, EL2) | - NV_FTR(PFR0, EL1)); - /* 64bit EL1/EL2/EL3 only */ - val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001); - val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001); - val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001); + /* No RME, AMU, MPAM, S-EL2, or RAS */ + val &= ~(ID_AA64PFR0_EL1_RME | + ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SEL2 | + ID_AA64PFR0_EL1_RAS | + ID_AA64PFR0_EL1_EL3 | + ID_AA64PFR0_EL1_EL2 | + ID_AA64PFR0_EL1_EL1 | + ID_AA64PFR0_EL1_EL0); + /* 64bit only at any EL */ + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP); + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP); break; case SYS_ID_AA64PFR1_EL1: - /* Only support SSBS */ - val &= NV_FTR(PFR1, SSBS); + /* Only support BTI, SSBS, CSV2_frac */ + val &= (ID_AA64PFR1_EL1_BT | + ID_AA64PFR1_EL1_SSBS | + ID_AA64PFR1_EL1_CSV2_frac); break; case SYS_ID_AA64MMFR0_EL1: - /* Hide ECV, ExS, Secure Memory */ - val &= ~(NV_FTR(MMFR0, ECV) | - NV_FTR(MMFR0, EXS) | - NV_FTR(MMFR0, TGRAN4_2) | - NV_FTR(MMFR0, TGRAN16_2) | - NV_FTR(MMFR0, TGRAN64_2) | - NV_FTR(MMFR0, SNSMEM)); + /* Hide ExS, Secure Memory */ + val &= ~(ID_AA64MMFR0_EL1_EXS | + ID_AA64MMFR0_EL1_TGRAN4_2 | + ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_AA64MMFR0_EL1_TGRAN64_2 | + ID_AA64MMFR0_EL1_SNSMEM); + + /* Hide CNTPOFF if present */ + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP); /* Disallow unsupported S2 page sizes */ switch (PAGE_SIZE) { case SZ_64K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI); fallthrough; case SZ_16K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI); fallthrough; case SZ_4K: /* Support everything */ break; } + /* - * Since we can't support a guest S2 page size smaller than - * the host's own page size (due to KVM only populating its - * own S2 using the kernel's page size), advertise the - * limitation using FEAT_GTG. + * Since we can't support a guest S2 page size smaller + * than the host's own page size (due to KVM only + * populating its own S2 using the kernel's page + * size), advertise the limitation using FEAT_GTG. */ switch (PAGE_SIZE) { case SZ_4K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010); + if (has_tgran_2(orig_val, 4)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP); fallthrough; case SZ_16K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010); + if (has_tgran_2(orig_val, 16)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP); fallthrough; case SZ_64K: - val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010); + if (has_tgran_2(orig_val, 64)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP); break; } + /* Cap PARange to 48bits */ - tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val); - if (tmp > 0b0101) { - val &= ~NV_FTR(MMFR0, PARANGE); - val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101); - } + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48); break; case SYS_ID_AA64MMFR1_EL1: - val &= (NV_FTR(MMFR1, HCX) | - NV_FTR(MMFR1, PAN) | - NV_FTR(MMFR1, LO) | - NV_FTR(MMFR1, HPDS) | - NV_FTR(MMFR1, VH) | - NV_FTR(MMFR1, VMIDBits)); + val &= (ID_AA64MMFR1_EL1_HCX | + ID_AA64MMFR1_EL1_PAN | + ID_AA64MMFR1_EL1_LO | + ID_AA64MMFR1_EL1_HPDS | + ID_AA64MMFR1_EL1_VH | + ID_AA64MMFR1_EL1_VMIDBits); + /* FEAT_E2H0 implies no VHE */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) + val &= ~ID_AA64MMFR1_EL1_VH; break; case SYS_ID_AA64MMFR2_EL1: - val &= ~(NV_FTR(MMFR2, BBM) | - NV_FTR(MMFR2, TTL) | + val &= ~(ID_AA64MMFR2_EL1_BBM | + ID_AA64MMFR2_EL1_TTL | GENMASK_ULL(47, 44) | - NV_FTR(MMFR2, ST) | - NV_FTR(MMFR2, CCIDX) | - NV_FTR(MMFR2, VARange)); + ID_AA64MMFR2_EL1_ST | + ID_AA64MMFR2_EL1_CCIDX | + ID_AA64MMFR2_EL1_VARange); /* Force TTL support */ - val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP); break; case SYS_ID_AA64MMFR4_EL1: - val = 0; - if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) - val |= FIELD_PREP(NV_FTR(MMFR4, E2H0), - ID_AA64MMFR4_EL1_E2H0_NI_NV1); + /* + * You get EITHER + * + * - FEAT_VHE without FEAT_E2H0 + * - FEAT_NV limited to FEAT_NV2 + * - HCR_EL2.NV1 being RES0 + * + * OR + * + * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV + * + * Life is too short for anything else. + */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) { + val = 0; + } else { + val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY); + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1); + } break; case SYS_ID_AA64DFR0_EL1: - /* Only limited support for PMU, Debug, BPs and WPs */ - val &= (NV_FTR(DFR0, PMUVer) | - NV_FTR(DFR0, WRPs) | - NV_FTR(DFR0, BRPs) | - NV_FTR(DFR0, DebugVer)); + /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ + val &= (ID_AA64DFR0_EL1_PMUVer | + ID_AA64DFR0_EL1_WRPs | + ID_AA64DFR0_EL1_BRPs | + ID_AA64DFR0_EL1_DebugVer| + ID_AA64DFR0_EL1_HPMN0); /* Cap Debug to ARMv8.1 */ - tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val); - if (tmp > 0b0111) { - val &= ~NV_FTR(DFR0, DebugVer); - val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111); - } - break; - - default: - /* Unknown register, just wipe it clean */ - val = 0; + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, VHE); break; } return val; } -u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg sr, u64 v) { - u64 v = ctxt_sys_reg(&vcpu->arch.ctxt, sr); struct kvm_sysreg_masks *masks; masks = vcpu->kvm->arch.sysreg_masks; if (masks) { - sr -= __VNCR_START__; + sr -= __SANITISED_REG_START__; v &= ~masks->mask[sr].res0; v |= masks->mask[sr].res1; @@ -181,34 +993,32 @@ u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) return v; } -static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) +static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) { - int i = sr - __VNCR_START__; + int i = sr - __SANITISED_REG_START__; + + BUILD_BUG_ON(!__builtin_constant_p(sr)); + BUILD_BUG_ON(sr < __SANITISED_REG_START__); + BUILD_BUG_ON(sr >= NR_SYS_REGS); kvm->arch.sysreg_masks->mask[i].res0 = res0; kvm->arch.sysreg_masks->mask[i].res1 = res1; } -int kvm_init_nv_sysregs(struct kvm *kvm) +int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; u64 res0, res1; - int ret = 0; - mutex_lock(&kvm->arch.config_lock); + lockdep_assert_held(&kvm->arch.config_lock); if (kvm->arch.sysreg_masks) goto out; kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), - GFP_KERNEL); - if (!kvm->arch.sysreg_masks) { - ret = -ENOMEM; - goto out; - } - - for (int i = 0; i < KVM_ARM_ID_REG_NUM; i++) - kvm->arch.id_regs[i] = limit_nv_id_reg(IDX_IDREG(i), - kvm->arch.id_regs[i]); + GFP_KERNEL_ACCOUNT); + if (!kvm->arch.sysreg_masks) + return -ENOMEM; /* VTTBR_EL2 */ res0 = res1 = 0; @@ -248,10 +1058,11 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= HCR_FIEN; if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP)) res0 |= HCR_FWB; - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2)) - res0 |= HCR_NV2; - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP)) - res0 |= (HCR_AT | HCR_NV1 | HCR_NV); + /* Implementation choice: NV2 is the only supported config */ + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + res0 |= (HCR_NV2 | HCR_NV | HCR_AT); + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, NI)) + res0 |= HCR_NV1; if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) res0 |= (HCR_API | HCR_APK); @@ -261,6 +1072,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= (HCR_TEA | HCR_TERR); if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) res0 |= HCR_TLOR; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= HCR_E2H; if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP)) res1 |= HCR_E2H; set_sysreg_masks(kvm, HCR_EL2, res0, res1); @@ -435,8 +1248,83 @@ int kvm_init_nv_sysregs(struct kvm *kvm) if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) res0 |= ~(res0 | res1); set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); + + /* SCTLR_EL1 */ + res0 = SCTLR_EL1_RES0; + res1 = SCTLR_EL1_RES1; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + res0 |= SCTLR_EL1_EPAN; + set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + + /* MDCR_EL2 */ + res0 = MDCR_EL2_RES0; + res1 = MDCR_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) + res0 |= (MDCR_EL2_HPMN | MDCR_EL2_TPMCR | + MDCR_EL2_TPM | MDCR_EL2_HPME); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) + res0 |= MDCR_EL2_E2PB | MDCR_EL2_TPMS; + if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, SPMU, IMP)) + res0 |= MDCR_EL2_EnSPM; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P1)) + res0 |= MDCR_EL2_HPMD; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + res0 |= MDCR_EL2_TTRF; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) + res0 |= MDCR_EL2_HCCD | MDCR_EL2_HLP; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) + res0 |= MDCR_EL2_E2TB; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + res0 |= MDCR_EL2_TDCC; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, MTPMU, IMP) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) + res0 |= MDCR_EL2_MTPME; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P7)) + res0 |= MDCR_EL2_HPMFZO; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSS, IMP)) + res0 |= MDCR_EL2_PMSSE; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2)) + res0 |= MDCR_EL2_HPMFZS; + if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, EBEP, IMP)) + res0 |= MDCR_EL2_PMEE; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, V8P9)) + res0 |= MDCR_EL2_EBWE; + if (!kvm_has_feat(kvm, ID_AA64DFR2_EL1, STEP, IMP)) + res0 |= MDCR_EL2_EnSTEPOP; + set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + + /* CNTHCTL_EL2 */ + res0 = GENMASK(63, 20); + res1 = 0; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP)) + res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) { + res0 |= CNTHCTL_ECV; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP)) + res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT | + CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT); + } + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= GENMASK(11, 8); + set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1); + out: - mutex_unlock(&kvm->arch.config_lock); + for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) + (void)__vcpu_sys_reg(vcpu, sr); - return ret; + return 0; +} + +void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) { + struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + + write_lock(&vcpu->kvm->mmu_lock); + if (mmu->pending_unmap) { + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true); + mmu->pending_unmap = false; + } + write_unlock(&vcpu->kvm->mmu_lock); + } } diff --git a/arch/arm64/kvm/pauth.c b/arch/arm64/kvm/pauth.c new file mode 100644 index 0000000000000000000000000000000000000000..d5eb3ae876be4f58f99dda387d12c1a5b38dd53e --- /dev/null +++ b/arch/arm64/kvm/pauth.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 - Google LLC + * Author: Marc Zyngier + * + * Primitive PAuth emulation for ERETAA/ERETAB. + * + * This code assumes that is is run from EL2, and that it is part of + * the emulation of ERETAx for a guest hypervisor. That's a lot of + * baked-in assumptions and shortcuts. + * + * Do no reuse for anything else! + */ + +#include + +#include +#include +#include + +/* PACGA Xd, Xn, Xm */ +#define PACGA(d,n,m) \ + asm volatile(__DEFINE_ASM_GPR_NUMS \ + ".inst 0x9AC03000 |" \ + "(.L__gpr_num_%[Rd] << 0) |" \ + "(.L__gpr_num_%[Rn] << 5) |" \ + "(.L__gpr_num_%[Rm] << 16)\n" \ + : [Rd] "=r" ((d)) \ + : [Rn] "r" ((n)), [Rm] "r" ((m))) + +static u64 compute_pac(struct kvm_vcpu *vcpu, u64 ptr, + struct ptrauth_key ikey) +{ + struct ptrauth_key gkey; + u64 mod, pac = 0; + + preempt_disable(); + + if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) + mod = __vcpu_sys_reg(vcpu, SP_EL2); + else + mod = read_sysreg(sp_el1); + + gkey.lo = read_sysreg_s(SYS_APGAKEYLO_EL1); + gkey.hi = read_sysreg_s(SYS_APGAKEYHI_EL1); + + __ptrauth_key_install_nosync(APGA, ikey); + isb(); + + PACGA(pac, ptr, mod); + isb(); + + __ptrauth_key_install_nosync(APGA, gkey); + + preempt_enable(); + + /* PAC in the top 32bits */ + return pac; +} + +static bool effective_tbi(struct kvm_vcpu *vcpu, bool bit55) +{ + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + bool tbi, tbid; + + /* + * Since we are authenticating an instruction address, we have + * to take TBID into account. If E2H==0, ignore VA[55], as + * TCR_EL2 only has a single TBI/TBID. If VA[55] was set in + * this case, this is likely a guest bug... + */ + if (!vcpu_el2_e2h_is_set(vcpu)) { + tbi = tcr & BIT(20); + tbid = tcr & BIT(29); + } else if (bit55) { + tbi = tcr & TCR_TBI1; + tbid = tcr & TCR_TBID1; + } else { + tbi = tcr & TCR_TBI0; + tbid = tcr & TCR_TBID0; + } + + return tbi && !tbid; +} + +static int compute_bottom_pac(struct kvm_vcpu *vcpu, bool bit55) +{ + static const int maxtxsz = 39; // Revisit these two values once + static const int mintxsz = 16; // (if) we support TTST/LVA/LVA2 + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + int txsz; + + if (!vcpu_el2_e2h_is_set(vcpu) || !bit55) + txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + else + txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + + return 64 - clamp(txsz, mintxsz, maxtxsz); +} + +static u64 compute_pac_mask(struct kvm_vcpu *vcpu, bool bit55) +{ + int bottom_pac; + u64 mask; + + bottom_pac = compute_bottom_pac(vcpu, bit55); + + mask = GENMASK(54, bottom_pac); + if (!effective_tbi(vcpu, bit55)) + mask |= GENMASK(63, 56); + + return mask; +} + +static u64 to_canonical_addr(struct kvm_vcpu *vcpu, u64 ptr, u64 mask) +{ + bool bit55 = !!(ptr & BIT(55)); + + if (bit55) + return ptr | mask; + + return ptr & ~mask; +} + +static u64 corrupt_addr(struct kvm_vcpu *vcpu, u64 ptr) +{ + bool bit55 = !!(ptr & BIT(55)); + u64 mask, error_code; + int shift; + + if (effective_tbi(vcpu, bit55)) { + mask = GENMASK(54, 53); + shift = 53; + } else { + mask = GENMASK(62, 61); + shift = 61; + } + + if (esr_iss_is_eretab(kvm_vcpu_get_esr(vcpu))) + error_code = 2 << shift; + else + error_code = 1 << shift; + + ptr &= ~mask; + ptr |= error_code; + + return ptr; +} + +/* + * Authenticate an ERETAA/ERETAB instruction, returning true if the + * authentication succeeded and false otherwise. In all cases, *elr + * contains the VA to ERET to. Potential exception injection is left + * to the caller. + */ +bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr) +{ + u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 ptr, cptr, pac, mask; + struct ptrauth_key ikey; + + *elr = ptr = vcpu_read_sys_reg(vcpu, ELR_EL2); + + /* We assume we're already in the context of an ERETAx */ + if (esr_iss_is_eretab(esr)) { + if (!(sctlr & SCTLR_EL1_EnIB)) + return true; + + ikey.lo = __vcpu_sys_reg(vcpu, APIBKEYLO_EL1); + ikey.hi = __vcpu_sys_reg(vcpu, APIBKEYHI_EL1); + } else { + if (!(sctlr & SCTLR_EL1_EnIA)) + return true; + + ikey.lo = __vcpu_sys_reg(vcpu, APIAKEYLO_EL1); + ikey.hi = __vcpu_sys_reg(vcpu, APIAKEYHI_EL1); + } + + mask = compute_pac_mask(vcpu, !!(ptr & BIT(55))); + cptr = to_canonical_addr(vcpu, ptr, mask); + + pac = compute_pac(vcpu, cptr, ikey); + + /* + * Slightly deviate from the pseudocode: if we have a PAC + * match with the signed pointer, then it must be good. + * Anything after this point is pure error handling. + */ + if ((pac & mask) == (ptr & mask)) { + *elr = cptr; + return true; + } + + /* + * Authentication failed, corrupt the canonical address if + * PAuth2 isn't implemented, or some XORing if it is. + */ + if (!kvm_has_pauth(vcpu->kvm, PAuth2)) + cptr = corrupt_addr(vcpu, cptr); + else + cptr = ptr ^ (pac & mask); + + *elr = cptr; + return false; +} diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 9095f4ed79db24e7682f54a02da8f33469da0b61..af397c34819e4192335fc644320ca538473d811d 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -220,7 +220,6 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm) int pkvm_init_host_vm(struct kvm *host_kvm) { - mutex_init(&host_kvm->lock); return 0; } @@ -257,6 +256,7 @@ static int __init finalize_pkvm(void) * at, which would end badly once inaccessible. */ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); + kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start); kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); ret = pkvm_drop_host_privileges(); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index ea6682ae3d3d130fab8b9c8d7763d0df0cf9375f..65fbfcd1b3394412b8ab0e079c40ac471773f75c 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -48,7 +48,7 @@ static u32 __kvm_pmu_event_mask(unsigned int pmuver) static u32 kvm_pmu_event_mask(struct kvm *kvm) { - u64 dfr0 = IDREG(kvm, SYS_ID_AA64DFR0_EL1); + u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0); return __kvm_pmu_event_mask(pmuver); @@ -84,7 +84,11 @@ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - u64 val = kvm_vcpu_read_pmcr(kvm_pmc_to_vcpu(pmc)); + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 val = kvm_vcpu_read_pmcr(vcpu); + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP; return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); @@ -106,6 +110,11 @@ static u32 counter_index_to_evtreg(u64 idx) return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } +static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc) +{ + return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx)); +} + static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); @@ -239,7 +248,7 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) */ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu); int i; for_each_set_bit(i, &mask, 32) @@ -260,7 +269,37 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) irq_work_sync(&vcpu->arch.pmu.overflow_work); } -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + unsigned int hpmn; + + if (!vcpu_has_nv(vcpu) || idx == ARMV8_PMU_CYCLE_IDX) + return false; + + /* + * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't + * implemented. Since KVM's ability to emulate HPMN=0 does not directly + * depend on hardware (all PMU registers are trapped), make the + * implementation choice that all counters are included in the second + * range reserved for EL2/EL3. + */ + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + return idx >= hpmn; +} + +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + u64 hpmn; + + if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) + return mask; + + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + return mask & ~GENMASK(vcpu->kvm->arch.pmcr_n - 1, hpmn); +} + +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); @@ -568,7 +607,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + unsigned long mask = kvm_pmu_accessible_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); @@ -579,8 +618,44 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - return (kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); + unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx))) + return false; + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return mdcr & MDCR_EL2_HPME; + + return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E; +} + +static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0; + bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0; + + return u == nsu; +} + +static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1; + bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1; + + return p == nsk; +} + +static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD)) + return false; + + return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; } /** @@ -593,17 +668,15 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct perf_event *event; struct perf_event_attr attr; - u64 eventsel, reg, data; - bool p, u, nsk, nsu; + u64 eventsel, evtreg; - reg = counter_index_to_evtreg(pmc->idx); - data = __vcpu_sys_reg(vcpu, reg); + evtreg = kvm_pmc_read_evtreg(pmc); kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else - eventsel = data & kvm_pmu_event_mask(vcpu->kvm); + eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm); /* * Neither SW increment nor chained events need to be backed @@ -621,22 +694,25 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; - p = data & ARMV8_PMU_EXCLUDE_EL1; - u = data & ARMV8_PMU_EXCLUDE_EL0; - nsk = data & ARMV8_PMU_EXCLUDE_NS_EL1; - nsu = data & ARMV8_PMU_EXCLUDE_NS_EL0; - memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(pmc); - attr.exclude_user = (u != nsu); - attr.exclude_kernel = (p != nsk); + attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; + /* + * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the + * guest's EL2 filter. + */ + if (unlikely(is_hyp_ctxt(vcpu))) + attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc); + else + attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); + /* * If counting with a 64bit counter, advertise it to the perf * code, carefully dealing with the initial sample period @@ -771,7 +847,7 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu)); @@ -1095,3 +1171,32 @@ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); } + +void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) +{ + bool reprogrammed = false; + unsigned long mask; + int i; + + if (!kvm_vcpu_has_pmu(vcpu)) + return; + + mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + for_each_set_bit(i, &mask, 32) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + + /* + * We only need to reconfigure events where the filter is + * different at EL1 vs. EL2, as we're multiplexing the true EL1 + * event filter bit for nested. + */ + if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc)) + continue; + + kvm_pmu_create_perf_event(pmc); + reprogrammed = true; + } + + if (reprogrammed) + kvm_vcpu_pmu_restore_guest(vcpu); +} diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 953ad65236b5fbeb9a1aa468e877c8fc85b998af..26379905733306828aa4c4c35461e29275751812 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -32,6 +32,7 @@ /* Maximum phys_shift supported for any VM on this host */ static u32 __ro_after_init kvm_ipa_limit; +unsigned int __ro_after_init kvm_host_sve_max_vl; /* * ARMv8 Reset Values @@ -53,6 +54,9 @@ int __init kvm_arm_init_sve(void) if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); kvm_host_sve_max_vl = sve_max_vl(); +#ifndef MODULE + kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl; +#endif /* * The get_sve_reg()/set_sve_reg() ioctl interface will need @@ -75,11 +79,8 @@ int __init kvm_arm_init_sve(void) return 0; } -static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) +static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) { - if (!system_supports_sve()) - return -EINVAL; - vcpu->arch.sve_max_vl = kvm_sve_max_vl; /* @@ -88,8 +89,6 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) * kvm_arm_vcpu_finalize(), which freezes the configuration. */ vcpu_set_flag(vcpu, GUEST_HAS_SVE); - - return 0; } /* @@ -158,7 +157,6 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { void *sve_state = vcpu->arch.sve_state; - kvm_vcpu_unshare_task_fp(vcpu); kvm_unshare_hyp(vcpu, vcpu + 1); if (sve_state) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); @@ -172,22 +170,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } -static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) -{ - /* - * For now make sure that both address/generic pointer authentication - * features are requested by the userspace together and the system - * supports these capabilities. - */ - if (!test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || - !test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features) || - !system_has_full_ptr_auth()) - return -EINVAL; - - vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); - return 0; -} - /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer @@ -206,10 +188,9 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) * disable preemption around the vcpu reset as we would otherwise race with * preempt notifiers which also call put/load. */ -int kvm_reset_vcpu(struct kvm_vcpu *vcpu) +void kvm_reset_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_reset_state reset_state; - int ret; bool loaded; u32 pstate; @@ -226,29 +207,16 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) if (loaded) kvm_arch_vcpu_put(vcpu); - /* Disallow NV+SVE for the time being */ - if (vcpu_has_nv(vcpu) && vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { - ret = -EINVAL; - goto out; - } - if (!kvm_arm_vcpu_sve_finalized(vcpu)) { - if (test_bit(KVM_ARM_VCPU_SVE, vcpu->arch.features)) { - ret = kvm_vcpu_enable_sve(vcpu); - if (ret) - goto out; - } + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) + kvm_vcpu_enable_sve(vcpu); } else { kvm_vcpu_reset_sve(vcpu); } - if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || - test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) { - if (kvm_vcpu_enable_ptrauth(vcpu)) { - ret = -EINVAL; - goto out; - } - } + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) + kvm_vcpu_enable_ptrauth(vcpu); if (vcpu_el1_is_32bit(vcpu)) pstate = VCPU_RESET_PSTATE_SVC; @@ -257,11 +225,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) else pstate = VCPU_RESET_PSTATE_EL1; - if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) { - ret = -EINVAL; - goto out; - } - /* Reset core registers */ memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); @@ -296,12 +259,17 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } /* Reset timer */ - ret = kvm_timer_vcpu_reset(vcpu); -out: + kvm_timer_vcpu_reset(vcpu); + if (loaded) kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); - return ret; +} + +u32 kvm_get_pa_bits(struct kvm *kvm) +{ + /* Fixed limit until we can configure ID_AA64MMFR0.PARange */ + return kvm_ipa_limit; } u32 get_kvm_ipa_limit(void) @@ -318,12 +286,11 @@ int __init kvm_set_ipa_limit(void) parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); /* - * IPA size beyond 48 bits could not be supported - * on either 4K or 16K page size. Hence let's cap - * it to 48 bits, in case it's reported as larger - * on the system. + * IPA size beyond 48 bits for 4K and 16K page size is only supported + * when LPA2 is available. So if we have LPA2, enable it, else cap to 48 + * bits, in case it's reported as larger on the system. */ - if (PAGE_SIZE != SZ_64K) + if (!kvm_lpa2_is_enabled() && PAGE_SIZE != SZ_64K) parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48); /* diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 340ae3c84d9ecd7c8dec05508c06fdb07b6f2222..16ba418d812750174d04bb56cb4cbf0edd3bce9f 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -48,6 +48,13 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); +static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return false; +} + static bool bad_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r, @@ -55,8 +62,7 @@ static bool bad_trap(struct kvm_vcpu *vcpu, { WARN_ONCE(1, "Unexpected %s\n", msg); print_sys_reg_instr(params); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, params, r); } static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -104,6 +110,14 @@ static bool get_el2_to_el1_mapping(unsigned int reg, PURE_EL2_SYSREG( RVBAR_EL2 ); PURE_EL2_SYSREG( TPIDR_EL2 ); PURE_EL2_SYSREG( HPFAR_EL2 ); + PURE_EL2_SYSREG( HCRX_EL2 ); + PURE_EL2_SYSREG( HFGRTR_EL2 ); + PURE_EL2_SYSREG( HFGWTR_EL2 ); + PURE_EL2_SYSREG( HFGITR_EL2 ); + PURE_EL2_SYSREG( HDFGRTR_EL2 ); + PURE_EL2_SYSREG( HDFGWTR_EL2 ); + PURE_EL2_SYSREG( HAFGRTR_EL2 ); + PURE_EL2_SYSREG( CNTVOFF_EL2 ); PURE_EL2_SYSREG( CNTHCTL_EL2 ); MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, translate_sctlr_el2_to_sctlr_el1 ); @@ -120,9 +134,14 @@ static bool get_el2_to_el1_mapping(unsigned int reg, MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); + MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); + MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); + MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL ); + MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); default: return false; } @@ -141,6 +160,21 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) if (!is_hyp_ctxt(vcpu)) goto memory_read; + /* + * CNTHCTL_EL2 requires some special treatment to + * account for the bits that can be set via CNTKCTL_EL1. + */ + switch (reg) { + case CNTHCTL_EL2: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; + return val; + } + break; + } + /* * If this register does not have an EL1 counterpart, * then read the stored EL2 version. @@ -158,6 +192,9 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) /* Get the current version of the EL1 counterpart. */ WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val)); + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + return val; } @@ -191,6 +228,19 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) */ __vcpu_sys_reg(vcpu, reg) = val; + switch (reg) { + case CNTHCTL_EL2: + /* + * If E2H=0, CNHTCTL_EL2 is a pure shadow register. + * Otherwise, some of the bits are backed by + * CNTKCTL_EL1, while the rest is kept in memory. + * Yes, this is fun stuff. + */ + if (vcpu_el2_e2h_is_set(vcpu)) + write_sysreg_el1(val, SYS_CNTKCTL); + return; + } + /* No EL1 counterpart? We're done here.? */ if (reg == el1r) return; @@ -346,10 +396,8 @@ static bool access_dcgsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_has_mte(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_mte(vcpu->kvm)) + return undef_access(vcpu, p, r); /* Treat MTE S/W ops as we treat the classic ones: with contempt */ return access_dcsw(vcpu, p, r); @@ -385,6 +433,10 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, bool was_enabled = vcpu_has_cache_enabled(vcpu); u64 val, mask, shift; + if (reg_to_encoding(r) == SYS_TCR2_EL1 && + !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + return undef_access(vcpu, p, r); + BUG_ON(!p->is_write); get_access_mask(r, &mask, &shift); @@ -403,6 +455,18 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, return true; } +static bool access_tcr2_el2(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + return access_rw(vcpu, p, r); +} + static bool access_actlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -430,10 +494,8 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, { bool g1; - if (!kvm_has_gicv3(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); if (!p->is_write) return read_from_write_only(vcpu, p, r); @@ -478,6 +540,9 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (p->is_write) return ignore_write(vcpu, p); @@ -495,14 +560,6 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu, return read_zero(vcpu, p); } -static bool trap_undef(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - return false; -} - /* * ARMv8.1 mandates at least a trivial LORegion implementation, where all the * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 @@ -515,10 +572,8 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, { u32 sr = reg_to_encoding(r); - if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) + return undef_access(vcpu, p, r); if (p->is_write && sr == SYS_LORID_EL1) return write_to_read_only(vcpu, p, r); @@ -1128,7 +1183,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va { bool set; - val &= kvm_pmu_valid_counter_mask(vcpu); + val &= kvm_pmu_accessible_counter_mask(vcpu); switch (r->reg) { case PMOVSSET_EL0: @@ -1151,7 +1206,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); *val = __vcpu_sys_reg(vcpu, r->reg) & mask; return 0; @@ -1165,7 +1220,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; if (r->Op2 & 0x1) { @@ -1188,7 +1243,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (check_pmu_access_disabled(vcpu, 0)) return false; @@ -1212,7 +1267,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (pmu_access_el0_disabled(vcpu)) return false; @@ -1242,7 +1297,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_write_swinc_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); kvm_pmu_software_increment(vcpu, p->regval & mask); return true; } @@ -1251,10 +1306,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { - if (!vcpu_mode_priv(vcpu)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!vcpu_mode_priv(vcpu)) + return undef_access(vcpu, p, r); __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; @@ -1338,14 +1391,6 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } -static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - - return false; -} - /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } #define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } @@ -1382,30 +1427,149 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, switch (reg) { case SYS_CNTP_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTV_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_AARCH32_CNTP_TVAL: + case SYS_CNTP_TVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; + + case SYS_CNTV_TVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHP_TVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHV_TVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_CNTP_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTV_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_AARCH32_CNTP_CTL: + case SYS_CNTP_CTL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; + + case SYS_CNTV_CTL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHP_CTL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHV_CTL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_CNTP_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTV_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_AARCH32_CNTP_CVAL: + case SYS_CNTP_CVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; + + case SYS_CNTV_CVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHP_CVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHV_CVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; + break; + case SYS_AARCH32_CNTPCT: + case SYS_AARCH32_CNTPCTSS: tmr = TIMER_PTIMER; treg = TIMER_REG_CNT; break; + + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + + case SYS_AARCH32_CNTVCT: + case SYS_AARCH32_CNTVCTSS: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, p, r); } if (p->is_write) @@ -1572,18 +1736,30 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); - if (!cpus_have_final_cap(ARM64_HAS_WFXT)) + if (!cpus_have_final_cap(ARM64_HAS_WFXT) || + has_broken_cntvoff()) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS); break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; + val &= ~ID_AA64MMFR2_EL1_NV; + break; + case SYS_ID_AA64MMFR3_EL1: + val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE | + ID_AA64MMFR3_EL1_S1PIE; + + if (!system_supports_poe()) + val &= ~ID_AA64MMFR3_EL1_S1POE; break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); break; } + if (vcpu_has_nv(vcpu)) + val = limit_nv_id_reg(vcpu->kvm, id, val); + return val; } @@ -1595,20 +1771,37 @@ static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - return IDREG(vcpu->kvm, reg_to_encoding(r)); + return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r)); +} + +static bool is_feature_id_reg(u32 encoding) +{ + return (sys_reg_Op0(encoding) == 3 && + (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) && + sys_reg_CRn(encoding) == 0 && + sys_reg_CRm(encoding) <= 7); } /* * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is - * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. + * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID + * registers KVM maintains on a per-VM basis. */ -static inline bool is_id_reg(u32 id) +static inline bool is_vm_ftr_id_reg(u32 id) { + if (id == SYS_CTR_EL0) + return true; + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && sys_reg_CRm(id) < 8); } +static inline bool is_vcpu_ftr_id_reg(u32 id) +{ + return is_feature_id_reg(id) && !is_vm_ftr_id_reg(id); +} + static inline bool is_aa32_id_reg(u32 id) { return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && @@ -1714,16 +1907,6 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) return val; } -#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ -({ \ - u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ - (val) &= ~reg##_##field##_MASK; \ - (val) |= FIELD_PREP(reg##_##field##_MASK, \ - min(__f_val, \ - (u64)SYS_FIELD_VALUE(reg, field, limit))); \ - (val); \ -}) - static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); @@ -1854,6 +2037,22 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, user_val); } +static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + u64 nv_mask = ID_AA64MMFR2_EL1_NV_MASK; + + /* + * We made the mistake to expose the now deprecated NV field, + * so allow userspace to write it, but silently ignore it. + */ + if ((hw_val & nv_mask) == (user_val & nv_mask)) + user_val &= ~nv_mask; + + return set_id_reg(vcpu, rd, user_val); +} + /* * cpufeature ID register user accessors * @@ -1904,7 +2103,7 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, ret = arm64_check_features(vcpu, rd, val); if (!ret) - IDREG(vcpu->kvm, id) = val; + kvm_set_vm_id_reg(vcpu->kvm, id, val); mutex_unlock(&vcpu->kvm->arch.config_lock); @@ -1920,6 +2119,18 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return ret; } +void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val) +{ + u64 *p = __vm_id_reg(&kvm->arch, reg); + + lockdep_assert_held(&kvm->arch.config_lock); + + if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm)) + return; + + *p = val; +} + static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { @@ -1939,7 +2150,7 @@ static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0); + p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0); return true; } @@ -2112,6 +2323,15 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = v, \ } +#define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ + SYS_DESC(SYS_##name), \ + .access = acc, \ + .reset = rst, \ + .reg = name, \ + .visibility = filter, \ + .val = v, \ +} + #define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) @@ -2120,8 +2340,8 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, * HCR_EL2.E2H==1, and only in the sysreg table for convenience of * handling traps. Given that, they are always hidden from userspace. */ -static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) { return REG_HIDDEN_USER; } @@ -2132,7 +2352,7 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, .reset = rst, \ .reg = name##_EL1, \ .val = v, \ - .visibility = elx2_visibility, \ + .visibility = hidden_user_visibility, \ } /* @@ -2146,44 +2366,44 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, * from userspace. */ -#define ID_DESC(name) \ - SYS_DESC(SYS_##name), \ +#define ID_DESC_DEFAULT_CALLBACKS \ .access = access_id_reg, \ - .get_user = get_id_reg \ - -/* sys_reg_desc initialiser for known cpufeature ID registers */ -#define ID_SANITISED(name) { \ - ID_DESC(name), \ + .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ - .val = 0, \ -} + .reset = kvm_read_sanitised_id_reg + +#define ID_DESC(name) \ + SYS_DESC(SYS_##name), \ + ID_DESC_DEFAULT_CALLBACKS /* sys_reg_desc initialiser for known cpufeature ID registers */ -#define AA32_ID_SANITISED(name) { \ +#define ID_SANITISED(name) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ - .visibility = aa32_id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } /* sys_reg_desc initialiser for writable ID registers */ #define ID_WRITABLE(name, mask) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ - .visibility = id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = mask, \ } +/* + * 32bit ID regs are fully writable when the guest is 32bit + * capable. Nothing in the KVM code should rely on 32bit features + * anyway, only 64bit, so let the VMM do its worse. + */ +#define AA32_ID_WRITABLE(name) { \ + ID_DESC(name), \ + .visibility = aa32_id_visibility, \ + .val = GENMASK(31, 0), \ +} + /* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ #define ID_FILTERED(sysreg, name, mask) { \ ID_DESC(sysreg), \ .set_user = set_##name, \ - .visibility = id_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = (mask), \ } @@ -2193,12 +2413,10 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, * (1 <= crm < 8, 0 <= Op2 < 8). */ #define ID_UNALLOCATED(crm, op2) { \ + .name = "S3_0_0_" #crm "_" #op2, \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ - .access = access_id_reg, \ - .get_user = get_id_reg, \ - .set_user = set_id_reg, \ + ID_DESC_DEFAULT_CALLBACKS, \ .visibility = raz_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } @@ -2209,9 +2427,7 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, */ #define ID_HIDDEN(name) { \ ID_DESC(name), \ - .set_user = set_id_reg, \ .visibility = raz_visibility, \ - .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } @@ -2261,6 +2477,42 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return __vcpu_sys_reg(vcpu, r->reg) = val; } +static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + unsigned int (*fn)(const struct kvm_vcpu *, + const struct sys_reg_desc *)) +{ + return el2_visibility(vcpu, rd) ?: fn(vcpu, rd); +} + +static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, sve_visibility); +} + +static bool access_zcr_el2(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + unsigned int vq; + + if (guest_hyp_sve_traps_enabled(vcpu)) { + kvm_inject_nested_sve_trap(vcpu); + return true; + } + + if (!p->is_write) { + p->regval = vcpu_read_sys_reg(vcpu, ZCR_EL2); + return true; + } + + vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; + vq = min(vq, vcpu_sve_max_vq(vcpu)); + vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2); + return true; +} + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2307,7 +2559,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { // DBGDTR[TR]X_EL0 share the same encoding { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, - { SYS_DESC(SYS_DBGVCR32_EL2), NULL, reset_val, DBGVCR32_EL2, 0 }, + { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, @@ -2318,40 +2570,39 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ - AA32_ID_SANITISED(ID_PFR0_EL1), - AA32_ID_SANITISED(ID_PFR1_EL1), + AA32_ID_WRITABLE(ID_PFR0_EL1), + AA32_ID_WRITABLE(ID_PFR1_EL1), { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, .get_user = get_id_reg, .set_user = set_id_dfr0_el1, .visibility = aa32_id_visibility, .reset = read_sanitised_id_dfr0_el1, - .val = ID_DFR0_EL1_PerfMon_MASK | - ID_DFR0_EL1_CopDbg_MASK, }, + .val = GENMASK(31, 0) }, ID_HIDDEN(ID_AFR0_EL1), - AA32_ID_SANITISED(ID_MMFR0_EL1), - AA32_ID_SANITISED(ID_MMFR1_EL1), - AA32_ID_SANITISED(ID_MMFR2_EL1), - AA32_ID_SANITISED(ID_MMFR3_EL1), + AA32_ID_WRITABLE(ID_MMFR0_EL1), + AA32_ID_WRITABLE(ID_MMFR1_EL1), + AA32_ID_WRITABLE(ID_MMFR2_EL1), + AA32_ID_WRITABLE(ID_MMFR3_EL1), /* CRm=2 */ - AA32_ID_SANITISED(ID_ISAR0_EL1), - AA32_ID_SANITISED(ID_ISAR1_EL1), - AA32_ID_SANITISED(ID_ISAR2_EL1), - AA32_ID_SANITISED(ID_ISAR3_EL1), - AA32_ID_SANITISED(ID_ISAR4_EL1), - AA32_ID_SANITISED(ID_ISAR5_EL1), - AA32_ID_SANITISED(ID_MMFR4_EL1), - AA32_ID_SANITISED(ID_ISAR6_EL1), + AA32_ID_WRITABLE(ID_ISAR0_EL1), + AA32_ID_WRITABLE(ID_ISAR1_EL1), + AA32_ID_WRITABLE(ID_ISAR2_EL1), + AA32_ID_WRITABLE(ID_ISAR3_EL1), + AA32_ID_WRITABLE(ID_ISAR4_EL1), + AA32_ID_WRITABLE(ID_ISAR5_EL1), + AA32_ID_WRITABLE(ID_MMFR4_EL1), + AA32_ID_WRITABLE(ID_ISAR6_EL1), /* CRm=3 */ - AA32_ID_SANITISED(MVFR0_EL1), - AA32_ID_SANITISED(MVFR1_EL1), - AA32_ID_SANITISED(MVFR2_EL1), + AA32_ID_WRITABLE(MVFR0_EL1), + AA32_ID_WRITABLE(MVFR1_EL1), + AA32_ID_WRITABLE(MVFR2_EL1), ID_UNALLOCATED(3,3), - AA32_ID_SANITISED(ID_PFR2_EL1), + AA32_ID_WRITABLE(ID_PFR2_EL1), ID_HIDDEN(ID_DFR1_EL1), - AA32_ID_SANITISED(ID_MMFR5_EL1), + AA32_ID_WRITABLE(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ @@ -2361,7 +2612,6 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_MPAM | ID_AA64PFR0_EL1_SVE | ID_AA64PFR0_EL1_RAS | - ID_AA64PFR0_EL1_GIC | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP)), ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, @@ -2386,7 +2636,21 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,7), /* CRm=5 */ + /* + * Prior to FEAT_Debugv8.9, the architecture defines context-aware + * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). + * KVM does not trap + emulate the breakpoint registers, and as such + * cannot support a layout that misaligns with the underlying hardware. + * While it may be possible to describe a subset that aligns with + * hardware, just prevent changes to BRPs and CTX_CMPs altogether for + * simplicity. + * + * See DDI0487K.a, section D2.8.3 Breakpoint types and linking + * of breakpoints for more details. + */ ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, + ID_AA64DFR0_EL1_DoubleLock_MASK | + ID_AA64DFR0_EL1_WRPs_MASK | ID_AA64DFR0_EL1_PMUVer_MASK | ID_AA64DFR0_EL1_DebugVer_MASK), ID_SANITISED(ID_AA64DFR1_EL1), @@ -2426,14 +2690,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_VH | ID_AA64MMFR1_EL1_VMIDBits)), - ID_WRITABLE(ID_AA64MMFR2_EL1, ~(ID_AA64MMFR2_EL1_RES0 | + ID_FILTERED(ID_AA64MMFR2_EL1, + id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 | ID_AA64MMFR2_EL1_EVT | ID_AA64MMFR2_EL1_FWB | ID_AA64MMFR2_EL1_IDS | ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), - ID_SANITISED(ID_AA64MMFR3_EL1), - ID_SANITISED(ID_AA64MMFR4_EL1), + ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_S1PIE | + ID_AA64MMFR3_EL1_S1POE)), + ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), ID_UNALLOCATED(7,7), @@ -2463,6 +2730,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SPSR_EL1), access_spsr}, { SYS_DESC(SYS_ELR_EL1), access_elr}, + { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, @@ -2522,18 +2791,31 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, - { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, - { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, @@ -2546,11 +2828,14 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1, - .set_user = set_clidr }, + .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, - { SYS_DESC(SYS_CTR_EL0), access_ctr }, + ID_WRITABLE(CTR_EL0, CTR_EL0_DIC_MASK | + CTR_EL0_IDC_MASK | + CTR_EL0_DminLine_MASK | + CTR_EL0_IminLine_MASK), { SYS_DESC(SYS_SVCR), undef_access }, { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, @@ -2674,11 +2959,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVTYPER1_EL0(15), { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer }, + /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), PMU_PMEVCNTR_EL0(1), @@ -2763,15 +3054,19 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), EL2_REG_VNCR(HACR_EL2, reset_val, 0), + EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0, + sve_el2_visibility), + EL2_REG_VNCR(HCRX_EL2, reset_val, 0), EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), + EL2_REG(TCR2_EL2, access_tcr2_el2, reset_val, TCR2_EL2_RES1), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), - { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, + { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0), @@ -2779,11 +3074,21 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_REDIR(ELR_EL2, reset_val, 0), { SYS_DESC(SYS_SP_EL1), access_sp_el1}, - { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, + /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ + { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi, + .visibility = hidden_user_visibility }, + { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi, + .visibility = hidden_user_visibility }, + { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi, + .visibility = hidden_user_visibility }, + { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi, + .visibility = hidden_user_visibility }, + + { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 }, EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), EL2_REG_REDIR(ESR_EL2, reset_val, 0), - { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, + { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, EL2_REG_REDIR(FAR_EL2, reset_val, 0), EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), @@ -2804,32 +3109,515 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), - { SYS_DESC(SYS_RMR_EL2), trap_undef }, + { SYS_DESC(SYS_RMR_EL2), undef_access }, + + EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0), EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, + EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), + EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), + + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer }, + EL2_REG(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0), + EL2_REG(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0), EL12_REG(CNTKCTL, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer }, + + { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer }, + EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; +static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s1e01(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* There is no FGT associated with AT S1E2A :-( */ + if (op == OP_AT_S1E2A && + !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + __kvm_at_s1e2(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s12(vcpu, op, p->regval); + + return true; +} + +static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + + if (sys_reg_CRn(instr) == TLBI_CRn_nXS && + !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + return false; + + if (CRm == TLBI_CRm_nROS && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + return true; +} + +static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + write_lock(&vcpu->kvm->mmu_lock); + + /* + * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the + * corresponding VMIDs. + */ + kvm_nested_s2_unmap(vcpu->kvm, true); + + write_unlock(&vcpu->kvm->mmu_lock); + + return true; +} + +static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr) +{ + struct kvm *kvm = vpcu->kvm; + u8 CRm = sys_reg_CRm(instr); + u8 Op2 = sys_reg_Op2(instr); + + if (sys_reg_CRn(instr) == TLBI_CRn_nXS && + !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + return false; + + if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + return false; + + if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) && + !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + return false; + + return true; +} + +/* Only defined here as this is an internal "abstraction" */ +union tlbi_info { + struct { + u64 start; + u64 size; + } range; + + struct { + u64 addr; + } ipa; + + struct { + u64 addr; + u32 encoding; + } va; +}; + +static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + /* + * The unmap operation is allowed to drop the MMU lock and block, which + * means that @mmu could be used for a different context than the one + * currently being invalidated. + * + * This behavior is still safe, as: + * + * 1) The vCPU(s) that recycled the MMU are responsible for invalidating + * the entire MMU before reusing it, which still honors the intent + * of a TLBI. + * + * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC + * and ERET to the guest), other vCPUs are allowed to use stale + * translations. + * + * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and + * at worst may cause more aborts for shadow stage-2 fills. + * + * Dropping the MMU lock also implies that shadow stage-2 fills could + * happen behind the back of the TLBI. This is still safe, though, as + * the L1 needs to put its stage-2 in a consistent state before doing + * the TLBI. + */ + kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); +} + +static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 limit, vttbr; + + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .range = { + .start = 0, + .size = limit, + }, + }, + s2_mmu_unmap_range); + + return true; +} + +static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + u64 base, range, tg, num, scale; + int shift; + + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + /* + * Because the shadow S2 structure doesn't necessarily reflect that + * of the guest's S2 (different base granule size, for example), we + * decide to ignore TTL and only use the described range. + */ + tg = FIELD_GET(GENMASK(47, 46), p->regval); + scale = FIELD_GET(GENMASK(45, 44), p->regval); + num = FIELD_GET(GENMASK(43, 39), p->regval); + base = p->regval & GENMASK(36, 0); + + switch(tg) { + case 1: + shift = 12; + break; + case 2: + shift = 14; + break; + case 3: + default: /* IMPDEF: handle tg==0 as 64k */ + shift = 16; + break; + } + + base <<= shift; + range = __TLBI_RANGE_PAGES(num, scale) << shift; + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .range = { + .start = base, + .size = range, + }, + }, + s2_mmu_unmap_range); + + return true; +} + +static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + unsigned long max_size; + u64 base_addr; + + /* + * We drop a number of things from the supplied value: + * + * - NS bit: we're non-secure only. + * + * - IPA[51:48]: We don't support 52bit IPA just yet... + * + * And of course, adjust the IPA to be on an actual address. + */ + base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12; + max_size = compute_tlb_inval_range(mmu, info->ipa.addr); + base_addr &= ~(max_size - 1); + + /* + * See comment in s2_mmu_unmap_range() for why this is allowed to + * reschedule. + */ + kvm_stage2_unmap_range(mmu, base_addr, max_size, true); +} + +static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .ipa = { + .addr = p->regval, + }, + }, + s2_mmu_unmap_ipa); + + return true; +} + +static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu, + const union tlbi_info *info) +{ + WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding)); +} + +static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + + /* + * If we're here, this is because we've trapped on a EL1 TLBI + * instruction that affects the EL1 translation regime while + * we're running in a context that doesn't allow us to let the + * HW do its thing (aka vEL2): + * + * - HCR_EL2.E2H == 0 : a non-VHE guest + * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode + * + * We don't expect these helpers to ever be called when running + * in a vEL1 context. + */ + + WARN_ON(!vcpu_is_el2(vcpu)); + + if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + &(union tlbi_info) { + .va = { + .addr = p->regval, + .encoding = sys_encoding, + }, + }, + s2_mmu_tlbi_s1e1); + + return true; +} + +#define SYS_INSN(insn, access_fn) \ + { \ + SYS_DESC(OP_##insn), \ + .access = (access_fn), \ + } + static struct sys_reg_desc sys_insn_descs[] = { { SYS_DESC(SYS_DC_ISW), access_dcsw }, { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, + + SYS_INSN(AT_S1E1R, handle_at_s1e01), + SYS_INSN(AT_S1E1W, handle_at_s1e01), + SYS_INSN(AT_S1E0R, handle_at_s1e01), + SYS_INSN(AT_S1E0W, handle_at_s1e01), + SYS_INSN(AT_S1E1RP, handle_at_s1e01), + SYS_INSN(AT_S1E1WP, handle_at_s1e01), + { SYS_DESC(SYS_DC_CSW), access_dcsw }, { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, { SYS_DESC(SYS_DC_CISW), access_dcsw }, { SYS_DESC(SYS_DC_CIGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw }, -}; -static const struct sys_reg_desc *first_idreg; + SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1), + + SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1), + + SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), + SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), + + SYS_INSN(AT_S1E2R, handle_at_s1e2), + SYS_INSN(AT_S1E2W, handle_at_s1e2), + SYS_INSN(AT_S12E1R, handle_at_s12), + SYS_INSN(AT_S12E1W, handle_at_s12), + SYS_INSN(AT_S12E0R, handle_at_s12), + SYS_INSN(AT_S12E0W, handle_at_s12), + SYS_INSN(AT_S1E2A, handle_at_s1e2), + + SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), + + SYS_INSN(TLBI_ALLE2OS, undef_access), + SYS_INSN(TLBI_VAE2OS, undef_access), + SYS_INSN(TLBI_ALLE1OS, handle_alle1is), + SYS_INSN(TLBI_VALE2OS, undef_access), + SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), + + SYS_INSN(TLBI_RVAE2IS, undef_access), + SYS_INSN(TLBI_RVALE2IS, undef_access), + + SYS_INSN(TLBI_ALLE1IS, handle_alle1is), + SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), + SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), + SYS_INSN(TLBI_RVAE2OS, undef_access), + SYS_INSN(TLBI_RVALE2OS, undef_access), + SYS_INSN(TLBI_RVAE2, undef_access), + SYS_INSN(TLBI_RVALE2, undef_access), + SYS_INSN(TLBI_ALLE1, handle_alle1is), + SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), + + SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), + + SYS_INSN(TLBI_ALLE2OSNXS, undef_access), + SYS_INSN(TLBI_VAE2OSNXS, undef_access), + SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), + SYS_INSN(TLBI_VALE2OSNXS, undef_access), + SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), + + SYS_INSN(TLBI_RVAE2ISNXS, undef_access), + SYS_INSN(TLBI_RVALE2ISNXS, undef_access), + SYS_INSN(TLBI_ALLE2ISNXS, undef_access), + SYS_INSN(TLBI_VAE2ISNXS, undef_access), + + SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), + SYS_INSN(TLBI_VALE2ISNXS, undef_access), + SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), + SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is), + SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is), + SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), + SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), + SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), + SYS_INSN(TLBI_RVAE2OSNXS, undef_access), + SYS_INSN(TLBI_RVALE2OSNXS, undef_access), + SYS_INSN(TLBI_RVAE2NXS, undef_access), + SYS_INSN(TLBI_RVALE2NXS, undef_access), + SYS_INSN(TLBI_ALLE2NXS, undef_access), + SYS_INSN(TLBI_VAE2NXS, undef_access), + SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), + SYS_INSN(TLBI_VALE2NXS, undef_access), + SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), +}; static bool trap_dbgdidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -2838,7 +3626,7 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, if (p->is_write) { return ignore_write(vcpu, p); } else { - u64 dfr = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); + u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1); u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP); p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) | @@ -3005,6 +3793,7 @@ static const struct sys_reg_desc cp15_regs[] = { /* TTBCR2 */ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, + { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, /* DFSR */ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, @@ -3054,8 +3843,28 @@ static const struct sys_reg_desc cp15_regs[] = { /* AMAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, - /* ICC_SRE */ - { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, @@ -3146,9 +3955,11 @@ static const struct sys_reg_desc cp15_64_regs[] = { { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ + { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer }, { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer }, }; static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, @@ -3511,6 +4322,25 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, return false; } +static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) +{ + unsigned long i, idreg_idx = 0; + + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *r = &sys_reg_descs[i]; + + if (!is_vm_ftr_id_reg(reg_to_encoding(r))) + continue; + + if (idreg_idx == pos) + return r; + + idreg_idx++; + } + + return NULL; +} + static void *idregs_debug_start(struct seq_file *s, loff_t *pos) { struct kvm *kvm = s->private; @@ -3522,7 +4352,7 @@ static void *idregs_debug_start(struct seq_file *s, loff_t *pos) if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) && *iter == (u8)~0) { *iter = *pos; - if (*iter >= KVM_ARM_ID_REG_NUM) + if (!idregs_debug_find(kvm, *iter)) iter = NULL; } else { iter = ERR_PTR(-EBUSY); @@ -3539,7 +4369,7 @@ static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) (*pos)++; - if ((kvm->arch.idreg_debugfs_iter + 1) < KVM_ARM_ID_REG_NUM) { + if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) { kvm->arch.idreg_debugfs_iter++; return &kvm->arch.idreg_debugfs_iter; @@ -3564,16 +4394,16 @@ static void idregs_debug_stop(struct seq_file *s, void *v) static int idregs_debug_show(struct seq_file *s, void *v) { - struct kvm *kvm = s->private; const struct sys_reg_desc *desc; + struct kvm *kvm = s->private; - desc = first_idreg + kvm->arch.idreg_debugfs_iter; + desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter); if (!desc->name) return 0; seq_printf(s, "%20s:\t%016llx\n", - desc->name, IDREG(kvm, IDX_IDREG(kvm->arch.idreg_debugfs_iter))); + desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc))); return 0; } @@ -3595,26 +4425,24 @@ void kvm_sys_regs_create_debugfs(struct kvm *kvm) &idregs_debug_fops); } -static void kvm_reset_id_regs(struct kvm_vcpu *vcpu) +static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) { - const struct sys_reg_desc *idreg = first_idreg; - u32 id = reg_to_encoding(idreg); + u32 id = reg_to_encoding(reg); struct kvm *kvm = vcpu->kvm; if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) return; - lockdep_assert_held(&kvm->arch.config_lock); - - /* Initialize all idregs */ - while (is_id_reg(id)) { - IDREG(kvm, id) = idreg->reset(vcpu, idreg); + kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg)); +} - idreg++; - id = reg_to_encoding(idreg); - } +static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *reg) +{ + if (kvm_vcpu_initialized(vcpu)) + return; - set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); + reg->reset(vcpu, reg); } /** @@ -3626,19 +4454,27 @@ static void kvm_reset_id_regs(struct kvm_vcpu *vcpu) */ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; unsigned long i; - kvm_reset_id_regs(vcpu); - for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { const struct sys_reg_desc *r = &sys_reg_descs[i]; - if (is_id_reg(reg_to_encoding(r))) + if (!r->reset) continue; - if (r->reset) + if (is_vm_ftr_id_reg(reg_to_encoding(r))) + reset_vm_ftr_id_reg(vcpu, r); + else if (is_vcpu_ftr_id_reg(reg_to_encoding(r))) + reset_vcpu_ftr_id_reg(vcpu, r); + else r->reset(vcpu, r); + + if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) + (void)__vcpu_sys_reg(vcpu, r->reg); } + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); } /** @@ -3753,8 +4589,8 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, */ #define FUNCTION_INVARIANT(reg) \ - static u64 get_##reg(struct kvm_vcpu *v, \ - const struct sys_reg_desc *r) \ + static u64 reset_##reg(struct kvm_vcpu *v, \ + const struct sys_reg_desc *r) \ { \ ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ return ((struct sys_reg_desc *)r)->val; \ @@ -3764,18 +4600,11 @@ FUNCTION_INVARIANT(midr_el1) FUNCTION_INVARIANT(revidr_el1) FUNCTION_INVARIANT(aidr_el1) -static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) -{ - ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); - return ((struct sys_reg_desc *)r)->val; -} - /* ->val is filled in by kvm_sys_reg_table_init() */ static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = { - { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, - { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 }, - { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 }, - { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, + { SYS_DESC(SYS_MIDR_EL1), NULL, reset_midr_el1 }, + { SYS_DESC(SYS_REVIDR_EL1), NULL, reset_revidr_el1 }, + { SYS_DESC(SYS_AIDR_EL1), NULL, reset_aidr_el1 }, }; static int get_invariant_sys_reg(u64 id, u64 __user *uaddr) @@ -4064,14 +4893,6 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) sys_reg_CRm(r), \ sys_reg_Op2(r)) -static bool is_feature_id_reg(u32 encoding) -{ - return (sys_reg_Op0(encoding) == 3 && - (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) && - sys_reg_CRn(encoding) == 0 && - sys_reg_CRm(encoding) <= 7); -} - int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range) { const void *zero_page = page_to_virt(ZERO_PAGE(0)); @@ -4094,20 +4915,11 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range * if (!is_feature_id_reg(encoding) || !reg->set_user) continue; - /* - * For ID registers, we return the writable mask. Other feature - * registers return a full 64bit mask. That's not necessary - * compliant with a given revision of the architecture, but the - * RES0/RES1 definitions allow us to do that. - */ - if (is_id_reg(encoding)) { - if (!reg->val || - (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) - continue; - val = reg->val; - } else { - val = ~0UL; + if (!reg->val || + (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) { + continue; } + val = reg->val; if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding)))) return -EFAULT; @@ -4116,11 +4928,34 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range * return 0; } -void kvm_init_sysreg(struct kvm_vcpu *vcpu) +static void vcpu_set_hcr(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - mutex_lock(&kvm->arch.config_lock); + if (has_vhe() || has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_el1_is_32bit(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_RW; + + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; /* * In the absence of FGT, we cannot independently trap TLBI @@ -4129,12 +4964,30 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu) */ if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) vcpu->arch.hcr_el2 |= HCR_TTLBOS; +} + +void kvm_calculate_traps(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->arch.config_lock); + vcpu_set_hcr(vcpu); + vcpu_set_ich_hcr(vcpu); if (cpus_have_final_cap(ARM64_HAS_HCX)) { - vcpu->arch.hcrx_el2 = HCRX_GUEST_FLAGS; + /* + * In general, all HCRX_EL2 bits are gated by a feature. + * The only reason we can set SMPME without checking any + * feature is that its effects are not directly observable + * from the guest. + */ + vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME; if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); + + if (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; } if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) @@ -4175,6 +5028,13 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu) HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS); + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) + kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A; + + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | + HFGITR_EL2_ATS1E1WP); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); @@ -4188,9 +5048,44 @@ void kvm_init_sysreg(struct kvm_vcpu *vcpu) mutex_unlock(&kvm->arch.config_lock); } +/* + * Perform last adjustments to the ID registers that are implied by the + * configuration outside of the ID regs themselves, as well as any + * initialisation that directly depend on these ID registers (such as + * RES0/RES1 behaviours). This is not the place to configure traps though. + * + * Because this can be called once per CPU, changes must be idempotent. + */ +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + guard(mutex)(&kvm->arch.config_lock); + + /* + * This hacks into the ID registers, so only perform it when the + * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream. + */ + if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) { + u64 val; + + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); + val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); + } + + if (vcpu_has_nv(vcpu)) { + int ret = kvm_init_nv_sysregs(vcpu); + if (ret) + return ret; + } + + return 0; +} + int __init kvm_sys_reg_table_init(void) { - struct sys_reg_params params; bool valid = true; unsigned int i; int ret = 0; @@ -4211,12 +5106,6 @@ int __init kvm_sys_reg_table_init(void) for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); - /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */ - params = encoding_to_params(SYS_ID_PFR0_EL1); - first_idreg = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); - if (!first_idreg) - return -EINVAL; - ret = populate_nv_trap_config(); for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 997eea21ba2ab3d7b08ae8f6178d208b74453606..927c48198164a359b46e24986f98d88a93d6b81d 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -235,6 +235,8 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x @@ -248,4 +250,21 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) +#define CP15_SYS_DESC(reg) \ + .name = #reg, \ + .aarch32_map = AA32_DIRECT, \ + Op0(0), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + +#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ +({ \ + u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ + (val) &= ~reg##_##field##_MASK; \ + (val) |= FIELD_PREP(reg##_##field##_MASK, \ + min(__f_val, \ + (u64)SYS_FIELD_VALUE(reg, field, limit))); \ + (val); \ +}) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index b4b2b3ca465da0cb5958aaf5a6e60eb24e758477..dd2b8d2b3c88d9343c3f09890c4516766223fb7e 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -71,6 +71,7 @@ void kvm_vgic_early_init(struct kvm *kvm) int kvm_vgic_create(struct kvm *kvm, u32 type) { struct kvm_vcpu *vcpu; + u64 aa64pfr0, pfr1; unsigned long i; int ret; @@ -119,10 +120,19 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - else + } else { INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); + aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); + } + + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); out_unlock: mutex_unlock(&kvm->arch.config_lock); @@ -182,27 +192,22 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) return 0; } -/** - * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data - * structures and register VCPU-specific KVM iodevs - * - * @vcpu: pointer to the VCPU being created and initialized - * - * Only do initialization, but do not actually enable the - * VGIC CPU interface - */ -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int ret = 0; int i; - vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + lockdep_assert_held(&vcpu->kvm->arch.config_lock); - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - raw_spin_lock_init(&vgic_cpu->ap_list_lock); - atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + if (vgic_cpu->private_irqs) + return 0; + + vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS, + sizeof(struct vgic_irq), + GFP_KERNEL_ACCOUNT); + + if (!vgic_cpu->private_irqs) + return -ENOMEM; /* * Enable and configure all SGIs to be edge-triggered and @@ -227,9 +232,48 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) } } + return 0; +} + +static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu) +{ + int ret; + + mutex_lock(&vcpu->kvm->arch.config_lock); + ret = vgic_allocate_private_irqs_locked(vcpu); + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return ret; +} + +/** + * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data + * structures and register VCPU-specific KVM iodevs + * + * @vcpu: pointer to the VCPU being created and initialized + * + * Only do initialization, but do not actually enable the + * VGIC CPU interface + */ +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int ret = 0; + + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + raw_spin_lock_init(&vgic_cpu->ap_list_lock); + atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); + if (!irqchip_in_kernel(vcpu->kvm)) return 0; + ret = vgic_allocate_private_irqs(vcpu); + if (ret) + return ret; + /* * If we are creating a VCPU with a GICv3 we must also register the * KVM io device for the redistributor that belongs to this VCPU. @@ -285,10 +329,13 @@ int vgic_init(struct kvm *kvm) /* Initialize groups on CPUs created before the VGIC type was known */ kvm_for_each_vcpu(idx, vcpu, kvm) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + ret = vgic_allocate_private_irqs_locked(vcpu); + if (ret) + goto out; for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + struct vgic_irq *irq = vgic_get_irq(kvm, vcpu, i); + switch (dist->vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V3: irq->group = 1; @@ -300,8 +347,12 @@ int vgic_init(struct kvm *kvm) break; default: ret = -EINVAL; - goto out; } + + vgic_put_irq(kvm, irq); + + if (ret) + goto out; } } @@ -379,8 +430,29 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) vgic_flush_pending_lpis(vcpu); INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_unregister_redist_iodev(vcpu); + /* + * If this vCPU is being destroyed because of a failed creation + * then unregister the redistributor to avoid leaving behind a + * dangling pointer to the vCPU struct. + * + * vCPUs that have been successfully created (i.e. added to + * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as + * this function gets called while holding kvm->arch.config_lock + * in the VM teardown path and would otherwise introduce a lock + * inversion w.r.t. kvm->srcu. + * + * vCPUs that failed creation are torn down outside of the + * kvm->arch.config_lock and do not get unregistered in + * kvm_vgic_destroy(), meaning it is both safe and necessary to + * do so here. + */ + if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu) + vgic_unregister_redist_iodev(vcpu); + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; } } @@ -411,6 +483,11 @@ void kvm_vgic_destroy(struct kvm *kvm) kvm_vgic_dist_destroy(kvm); mutex_unlock(&kvm->arch.config_lock); + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm_for_each_vcpu(i, vcpu, kvm) + vgic_unregister_redist_iodev(vcpu); + mutex_unlock(&kvm->slots_lock); } diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 7e9cdb78f7ce885f9917d380d99fa18547af47e6..ae5a44d5702d14d23fec89c355d4b44b26ceba56 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -464,17 +464,10 @@ void vgic_v2_load(struct kvm_vcpu *vcpu) kvm_vgic_global_state.vctrl_base + GICH_APR); } -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); -} - void vgic_v2_put(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - vgic_v2_vmcr_sync(vcpu); + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index e9d69806427cb3e35da0bc2461a31b0dbc11b2cd..a50410b69ed10dc4a25a95a5635fb82149d1e96c 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -292,6 +292,18 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EN; +} + +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* Hide GICv3 sysreg if necessary */ + if (!kvm_has_gicv3(vcpu->kvm)) { + vgic_v3->vgic_hcr |= ICH_HCR_TALL0 | ICH_HCR_TALL1 | ICH_HCR_TC; + return; + } + if (group0_trap) vgic_v3->vgic_hcr |= ICH_HCR_TALL0; if (group1_trap) @@ -728,15 +740,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - /* - * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen - * is dependent on ICC_SRE_EL1.SRE, and we have to perform the - * VMCR_EL2 save/restore in the world switch. - */ - if (likely(cpu_if->vgic_sre)) - kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - - kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -744,24 +748,13 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) WARN_ON(vgic_v4_load(vcpu)); } -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); -} - void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); - vgic_v3_vmcr_sync(vcpu); - - kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); - if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index db2a95762b1b663e7ccc2c45184f8a8af1fd4ffd..954effc3dc367df3d1f894e404413b3b99d6e28d 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -36,6 +36,11 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * we have to disable IRQs before taking this lock and everything lower * than it. * + * The config_lock has additional ordering requirements: + * kvm->slots_lock + * kvm->srcu + * kvm->arch.config_lock + * * If you need to take multiple locks, always take the upper lock first, * then the lower ones, e.g. first take the its_lock, then the irq_lock. * If you are already holding a lock and need to take a higher one, you @@ -937,10 +942,13 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_load(vcpu); else vgic_v3_load(vcpu); @@ -948,26 +956,18 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_put(vcpu); else vgic_v3_put(vcpu); } -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) -{ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_vmcr_sync(vcpu); - else - vgic_v3_vmcr_sync(vcpu); -} - int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 7cb5ee7d912d56bd5ebcd8b0bfcd79229d8beeba..3eb04f32541da25756c9ee7440440ee552d94f6d 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -238,7 +238,6 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); void vgic_v2_put(struct kvm_vcpu *vcpu); -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); @@ -269,7 +268,6 @@ bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); void vgic_v3_put(struct kvm_vcpu *vcpu); -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); @@ -366,11 +364,11 @@ void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); + static inline bool kvm_has_gicv3(struct kvm *kvm) { - return (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && - irqchip_in_kernel(kvm) && - kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); + return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } extern struct gic_kvm_info *gic_kvm_info; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a44ed0dda4b4f83c4769aba59b17e8b0645077b4..aae5d1c85d9a49f03995c4695902567ea52ccc16 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -261,16 +261,14 @@ static bool is_el1_data_abort(unsigned long esr) static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { - unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE; - if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr)) return false; - if (fsc_type == ESR_ELx_FSC_PERM) + if (esr_fsc_is_permission_fault(esr)) return true; if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && + return esr_fsc_is_translation_fault(esr) && (regs->pstate & PSR_PAN_BIT); return false; @@ -283,8 +281,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long flags; u64 par, dfsc; - if (!is_el1_data_abort(esr) || - (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT) + if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr)) return false; local_irq_save(flags); @@ -305,7 +302,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, * treat the translation fault as spurious. */ dfsc = FIELD_GET(SYS_PAR_EL1_FST, par); - return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT; + return !esr_fsc_is_translation_fault(dfsc); } static void die_kernel_fault(const char *msg, unsigned long addr, @@ -372,11 +369,6 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned long esr) return false; } -static bool is_translation_fault(unsigned long esr) -{ - return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; -} - static void __do_kernel_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { @@ -409,7 +401,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (is_translation_fault(esr) && + if (esr_fsc_is_translation_fault(esr) && kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return; @@ -800,18 +792,18 @@ static const struct fault_info fault_info[] = { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 8" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 12" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, { do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" }, { do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 18" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 19" }, + { do_sea, SIGKILL, SI_KERNEL, "level -1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" }, @@ -819,7 +811,7 @@ static const struct fault_info fault_info[] = { { do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented { do_bad, SIGKILL, SI_KERNEL, "unknown 25" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 26" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 27" }, + { do_sea, SIGKILL, SI_KERNEL, "level -1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented @@ -833,9 +825,9 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 38" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 39" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 40" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 41" }, + { do_bad, SIGKILL, SI_KERNEL, "level -1 address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 42" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 43" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level -1 translation fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 44" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 45" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 46" }, diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 8f5b7ce857ed4a8f2271e02579223324fca854d5..adcf547f74eb8e609d9eddab127bcae5bb1f8f5c 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -73,6 +73,10 @@ static int __init adjust_protection_map(void) protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } + if (lpa2_is_enabled()) + for (int i = 0; i < ARRAY_SIZE(protection_map); i++) + pgprot_val(protection_map[i]) &= ~PTE_SHARED; + return 0; } arch_initcall(adjust_protection_map); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 14fdf645edc887776266d3553c0aeda7c75040da..5284aff6d8d1564019afc97ca3e381ec9565612b 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -459,11 +459,14 @@ SYM_FUNC_START(__cpu_setup) ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 cbz x1, .Lskip_indirection +#define PTE_MAYBE_SHARED 0 mov_q x0, PIE_E0 msr REG_PIRE0_EL1, x0 mov_q x0, PIE_E1 msr REG_PIR_EL1, x0 +#undef PTE_MAYBE_SHARED + mov x0, TCR2_EL1x_PIE msr REG_TCR2_EL1, x0 diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 3123696a5fd42ab146818becfd9da9769224bd83..8d120994cf06fcd3d1876f0f99e67e647005e02d 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -39,6 +39,7 @@ HAS_GIC_PRIO_RELAXED_SYNC HAS_HCR_NV1 HAS_HCX HAS_LDAPR +HAS_LPA2 HAS_LSE_ATOMICS HAS_MOPS HAS_NESTED_VIRT @@ -103,6 +104,7 @@ WORKAROUND_CLEAN_CACHE WORKAROUND_DEVICE_LOAD_ACQUIRE WORKAROUND_NVIDIA_CARMEL_CNP WORKAROUND_QCOM_FALKOR_E1003 +WORKAROUND_QCOM_ORYON_CNTVOFF WORKAROUND_REPEAT_TLBI WORKAROUND_SPECULATIVE_AT WORKAROUND_SPECULATIVE_SSBS diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 6f1714fdbe04ce8a724162cf8d2bfaf1b3e643fe..5798ce738ad3ab51cc058515db0861c6db0a998b 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1330,7 +1330,7 @@ UnsignedEnum 55:52 BRBE 0b0001 IMP 0b0010 BRBE_V1P1 EndEnum -Enum 51:48 MTPMU +SignedEnum 51:48 MTPMU 0b0000 NI_IMPDEF 0b0001 IMP 0b1111 NI @@ -1338,6 +1338,7 @@ EndEnum UnsignedEnum 47:44 TraceBuffer 0b0000 NI 0b0001 IMP + 0b0010 TRBE_V1P1 EndEnum UnsignedEnum 43:40 TraceFilt 0b0000 NI @@ -1353,11 +1354,19 @@ UnsignedEnum 35:32 PMSVer 0b0010 V1P1 0b0011 V1P2 0b0100 V1P3 + 0b0101 V1P4 + 0b0110 V1P5 EndEnum Field 31:28 CTX_CMPs -Res0 27:24 +UnsignedEnum 27:24 SEBEP + 0b0000 NI + 0b0001 IMP +EndEnum Field 23:20 WRPs -Res0 19:16 +UnsignedEnum 19:16 PMSS + 0b0000 NI + 0b0001 IMP +EndEnum Field 15:12 BRPs UnsignedEnum 11:8 PMUVer 0b0000 NI @@ -1367,6 +1376,7 @@ UnsignedEnum 11:8 PMUVer 0b0110 V3P5 0b0111 V3P7 0b1000 V3P8 + 0b1001 V3P9 0b1111 IMP_DEF EndEnum UnsignedEnum 7:4 TraceVer @@ -1379,6 +1389,7 @@ UnsignedEnum 3:0 DebugVer 0b1000 V8P2 0b1001 V8P4 0b1010 V8P8 + 0b1011 V8P9 EndEnum EndSysreg @@ -1415,6 +1426,32 @@ Field 15:8 BRPs Field 7:0 SYSPMUID EndSysreg +Sysreg ID_AA64DFR2_EL1 3 0 0 5 2 +Res0 63:28 +UnsignedEnum 27:24 TRBE_EXC + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 23:20 SPE_nVM + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 19:16 SPE_EXC + 0b0000 NI + 0b0001 IMP +EndEnum +Res0 15:8 +UnsignedEnum 7:4 BWE + 0b0000 NI + 0b0001 FEAT_BWE + 0b0002 FEAT_BWE2 +EndEnum +UnsignedEnum 3:0 STEP + 0b0000 NI + 0b0001 IMP +EndEnum +EndSysreg + Sysreg ID_AA64AFR0_EL1 3 0 0 5 4 Res0 63:32 Field 31:28 IMPDEF7 @@ -1706,16 +1743,16 @@ Enum 35:32 TGRAN16_2 0b0010 IMP 0b0011 52_BIT EndEnum -Enum 31:28 TGRAN4 +SignedEnum 31:28 TGRAN4 0b0000 IMP 0b0001 52_BIT 0b1111 NI EndEnum -Enum 27:24 TGRAN64 +SignedEnum 27:24 TGRAN64 0b0000 IMP 0b1111 NI EndEnum -Enum 23:20 TGRAN16 +UnsignedEnum 23:20 TGRAN16 0b0000 NI 0b0001 IMP 0b0010 52_BIT @@ -1863,7 +1900,7 @@ Enum 23:20 CCIDX 0b0000 32 0b0001 64 EndEnum -Enum 19:16 VARange +UnsignedEnum 19:16 VARange 0b0000 48 0b0001 52 EndEnum diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index cbbc9a6dc571587db49b5e9b965f9e3610b83c55..ce6521ad04d12160a40d0d4b7ebd843a6b900a0d 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -22,6 +22,12 @@ #define CNTHCTL_EVNTDIR (1 << 3) #define CNTHCTL_EVNTI (0xF << 4) #define CNTHCTL_ECV (1 << 12) +#define CNTHCTL_EL1TVT (1 << 13) +#define CNTHCTL_EL1TVCT (1 << 14) +#define CNTHCTL_EL1NVPCT (1 << 15) +#define CNTHCTL_EL1NVVCT (1 << 16) +#define CNTHCTL_CNTVMASK (1 << 18) +#define CNTHCTL_CNTPMASK (1 << 19) enum arch_timer_reg { ARCH_TIMER_REG_CTRL, diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 8968a85c4155c1a469609a2480891a3bb5b6d34d..a551c15bd6bbf25c76c2d0cbb19cdc542470c9a9 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -97,8 +97,9 @@ struct arch_timer_cpu { int __init kvm_timer_hyp_init(bool has_gic); void kvm_timer_hyp_uninit(void); int kvm_timer_enable(struct kvm_vcpu *vcpu); -int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); +void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu); void kvm_timer_sync_user(struct kvm_vcpu *vcpu); bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); void kvm_timer_update_run(struct kvm_vcpu *vcpu); @@ -148,9 +149,34 @@ u64 timer_get_cval(struct arch_timer_context *ctxt); void kvm_timer_cpu_up(void); void kvm_timer_cpu_down(void); +/* CNTKCTL_EL1 valid bits as of DDI0487J.a */ +#define CNTKCTL_VALID_BITS (BIT(17) | GENMASK_ULL(9, 0)) + +DECLARE_STATIC_KEY_FALSE(broken_cntvoff_key); + +static inline bool has_broken_cntvoff(void) +{ + return static_branch_unlikely(&broken_cntvoff_key); +} + static inline bool has_cntpoff(void) { return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); } +static inline u64 timer_get_offset(struct arch_timer_context *ctxt) +{ + u64 offset = 0; + + if (!ctxt) + return 0; + + if (ctxt->offset.vm_offset) + offset += *ctxt->offset.vm_offset; + if (ctxt->offset.vcpu_offset) + offset += *ctxt->offset.vcpu_offset; + + return offset; +} + #endif diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 4f41b4161ba6c58f1469801cafc8fc509a8503cc..8924bd1314d436cfa705c70d9318e3adb6b73538 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -50,7 +50,8 @@ extern struct mutex arm_pmus_lock; #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu); +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu); +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu); u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); @@ -80,7 +81,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_resync_el0(void); #define kvm_vcpu_has_pmu(vcpu) \ - (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) + (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PMU_V3)) /* * Updates the vcpu's view of the pmu events for this cpu. @@ -99,6 +100,8 @@ int kvm_arm_set_default_pmu(struct kvm *kvm); u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm); u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx); +void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu); #else struct kvm_pmu { }; @@ -116,7 +119,11 @@ static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, } static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) {} -static inline u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +static inline u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) +{ + return 0; +} +static inline u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) { return 0; } @@ -190,6 +197,13 @@ static inline u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) return 0; } +static inline bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + return false; +} + +static inline void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) {} + #endif #endif diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h index 6e55b9283789b148f76030e63de34de03fc35cc0..e8fb624013d15c275fdfff258826ca90b3bb35da 100644 --- a/include/kvm/arm_psci.h +++ b/include/kvm/arm_psci.h @@ -26,7 +26,7 @@ static inline int kvm_psci_version(struct kvm_vcpu *vcpu) * revisions. It is thus safe to return the latest, unless * userspace has instructed us otherwise. */ - if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features)) { + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) { if (vcpu->kvm->arch.psci_version) return vcpu->kvm->arch.psci_version; diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 694d48469fb7cbcb9ae6f4f15fa8cd093ee911be..33375782f320e63d392746baebd20eec2092c98e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -329,7 +329,7 @@ struct vgic_cpu { struct vgic_v3_cpu_if vgic_v3; }; - struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; + struct vgic_irq *private_irqs; raw_spinlock_t ap_list_lock; /* Protects the ap_list */ @@ -388,7 +388,6 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); void kvm_vgic_put(struct kvm_vcpu *vcpu); -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f7753eca807c5e9d9802bdb25cfe9d452934f731..435b858192b0d5326e94936b8b4bd6a9c8c68c8c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -381,6 +381,7 @@ struct kvm_vcpu { #endif bool preempted; bool ready; + bool scheduled_out; struct kvm_vcpu_arch arch; struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 197ba0aebf7ac415e88e6a23de3f85c1376f0124..e5809c1f6cd0d322eb446672d7a2584a47898c6b 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -160,6 +160,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access +TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += arch_timer TEST_GEN_PROGS_aarch64 += demand_paging_test diff --git a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c new file mode 100644 index 0000000000000000000000000000000000000000..943d65fc6b0b9f5c5d8d213c069091236e9a51ab --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Check that, on a GICv3 system, not configuring GICv3 correctly +// results in all of the sysregs generating an UNDEF exception. + +#include +#include +#include + +static volatile bool handled; + +#define __check_sr_read(r) \ + ({ \ + uint64_t val; \ + \ + handled = false; \ + dsb(sy); \ + val = read_sysreg_s(SYS_ ## r); \ + val; \ + }) + +#define __check_sr_write(r) \ + do { \ + handled = false; \ + dsb(sy); \ + write_sysreg_s(0, SYS_ ## r); \ + isb(); \ + } while(0) + +/* Fatal checks */ +#define check_sr_read(r) \ + do { \ + __check_sr_read(r); \ + __GUEST_ASSERT(handled, #r " no read trap"); \ + } while(0) + +#define check_sr_write(r) \ + do { \ + __check_sr_write(r); \ + __GUEST_ASSERT(handled, #r " no write trap"); \ + } while(0) + +#define check_sr_rw(r) \ + do { \ + check_sr_read(r); \ + check_sr_write(r); \ + } while(0) + +static void guest_code(void) +{ + uint64_t val; + + /* + * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having + * hidden the feature at runtime without any other userspace action. + */ + __GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), + read_sysreg(id_aa64pfr0_el1)) == 0, + "GICv3 wrongly advertised"); + + /* + * Access all GICv3 registers, and fail if we don't get an UNDEF. + * Note that we happily access all the APxRn registers without + * checking their existance, as all we want to see is a failure. + */ + check_sr_rw(ICC_PMR_EL1); + check_sr_read(ICC_IAR0_EL1); + check_sr_write(ICC_EOIR0_EL1); + check_sr_rw(ICC_HPPIR0_EL1); + check_sr_rw(ICC_BPR0_EL1); + check_sr_rw(ICC_AP0R0_EL1); + check_sr_rw(ICC_AP0R1_EL1); + check_sr_rw(ICC_AP0R2_EL1); + check_sr_rw(ICC_AP0R3_EL1); + check_sr_rw(ICC_AP1R0_EL1); + check_sr_rw(ICC_AP1R1_EL1); + check_sr_rw(ICC_AP1R2_EL1); + check_sr_rw(ICC_AP1R3_EL1); + check_sr_write(ICC_DIR_EL1); + check_sr_read(ICC_RPR_EL1); + check_sr_write(ICC_SGI1R_EL1); + check_sr_write(ICC_ASGI1R_EL1); + check_sr_write(ICC_SGI0R_EL1); + check_sr_read(ICC_IAR1_EL1); + check_sr_write(ICC_EOIR1_EL1); + check_sr_rw(ICC_HPPIR1_EL1); + check_sr_rw(ICC_BPR1_EL1); + check_sr_rw(ICC_CTLR_EL1); + check_sr_rw(ICC_IGRPEN0_EL1); + check_sr_rw(ICC_IGRPEN1_EL1); + + /* + * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can + * be RAO/WI. Engage in non-fatal accesses, starting with a + * write of 0 to try and disable SRE, and let's see if it + * sticks. + */ + __check_sr_write(ICC_SRE_EL1); + if (!handled) + GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n"); + + val = __check_sr_read(ICC_SRE_EL1); + if (!handled) { + __GUEST_ASSERT((val & BIT(0)), + "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n"); + GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n"); + } + + GUEST_DONE(); +} + +static void guest_undef_handler(struct ex_regs *regs) +{ + /* Success, we've gracefully exploded! */ + handled = true; + regs->pc += 4; +} + +static void test_run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static void test_guest_no_gicv3(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Create a VM without a GICv3 */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_UNKNOWN, guest_undef_handler); + + test_run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t pfr0; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &pfr0); + __TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0), + "GICv3 not supported."); + kvm_vm_free(vm); + + test_guest_no_gicv3(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index e37a10ceec90e057b9a5242c43d19f7e52bc921c..882657db95a29d2523ea2c38d1f04c0a38b7e9d1 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -68,6 +68,8 @@ struct test_feature_reg { } static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = { + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DoubleLock, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, WRPs, 0), S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP), REG_FTR_END, @@ -126,6 +128,7 @@ static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV2, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, DIT, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, SEL2, 0), + REG_FTR_BITS(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL3, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 0), @@ -226,6 +229,7 @@ static void guest_code(void) GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); + GUEST_REG_SYNC(SYS_CTR_EL0); GUEST_DONE(); } @@ -334,8 +338,8 @@ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) return ftr; } -static void test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, - const struct reg_ftr_bits *ftr_bits) +static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, + const struct reg_ftr_bits *ftr_bits) { uint8_t shift = ftr_bits->shift; uint64_t mask = ftr_bits->mask; @@ -353,6 +357,8 @@ static void test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, vcpu_set_reg(vcpu, reg, val); vcpu_get_reg(vcpu, reg, &new_val); TEST_ASSERT_EQ(new_val, val); + + return new_val; } static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, @@ -381,7 +387,15 @@ static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, TEST_ASSERT_EQ(val, old_val); } -static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) +static uint64_t test_reg_vals[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + +#define encoding_to_range_idx(encoding) \ + KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(encoding), sys_reg_Op1(encoding), \ + sys_reg_CRn(encoding), sys_reg_CRm(encoding), \ + sys_reg_Op2(encoding)) + + +static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) { uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; struct reg_mask_range range = { @@ -405,9 +419,7 @@ static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) int idx; /* Get the index to masks array for the idreg */ - idx = KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(reg_id), sys_reg_Op1(reg_id), - sys_reg_CRn(reg_id), sys_reg_CRm(reg_id), - sys_reg_Op2(reg_id)); + idx = encoding_to_range_idx(reg_id); for (int j = 0; ftr_bits[j].type != FTR_END; j++) { /* Skip aarch32 reg on aarch64 only system, since they are RAZ/WI. */ @@ -421,7 +433,9 @@ static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) TEST_ASSERT_EQ(masks[idx] & ftr_bits[j].mask, ftr_bits[j].mask); test_reg_set_fail(vcpu, reg, &ftr_bits[j]); - test_reg_set_success(vcpu, reg, &ftr_bits[j]); + + test_reg_vals[idx] = test_reg_set_success(vcpu, reg, + &ftr_bits[j]); ksft_test_result_pass("%s\n", ftr_bits[j].name); } @@ -527,7 +541,6 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) { bool done = false; struct ucall uc; - uint64_t val; while (!done) { vcpu_run(vcpu); @@ -538,8 +551,8 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) break; case UCALL_SYNC: /* Make sure the written values are seen by guest */ - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(uc.args[2]), &val); - TEST_ASSERT_EQ(val, uc.args[3]); + TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])], + uc.args[3]); break; case UCALL_DONE: done = true; @@ -550,13 +563,100 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) } } +/* Politely lifted from arch/arm64/include/asm/cache.h */ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +static void test_clidr(struct kvm_vcpu *vcpu) +{ + uint64_t clidr; + int level; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), &clidr); + + /* find the first empty level in the cache hierarchy */ + for (level = 1; level < 7; level++) { + if (!CLIDR_CTYPE(clidr, level)) + break; + } + + /* + * If you have a mind-boggling 7 levels of cache, congratulations, you + * get to fix this. + */ + TEST_ASSERT(level <= 7, "can't find an empty level in cache hierarchy"); + + /* stick in a unified cache level */ + clidr |= BIT(2) << CLIDR_CTYPE_SHIFT(level); + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), clidr); + test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr; +} + +static void test_ctr(struct kvm_vcpu *vcpu) +{ + u64 ctr; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), &ctr); + ctr &= ~CTR_EL0_DIC_MASK; + if (ctr & CTR_EL0_IminLine_MASK) + ctr--; + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), ctr); + test_reg_vals[encoding_to_range_idx(SYS_CTR_EL0)] = ctr; +} + +static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +{ + u64 val; + + test_clidr(vcpu); + test_ctr(vcpu); + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &val); + val++; + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); + + test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; + ksft_test_result_pass("%s\n", __func__); +} + +static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encoding) +{ + size_t idx = encoding_to_range_idx(encoding); + uint64_t observed; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding), &observed); + TEST_ASSERT_EQ(test_reg_vals[idx], observed); +} + +static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) +{ + /* + * Calls KVM_ARM_VCPU_INIT behind the scenes, which will do an + * architectural reset of the vCPU. + */ + aarch64_vcpu_setup(vcpu, NULL); + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) + test_assert_id_reg_unchanged(vcpu, test_regs[i].reg); + + test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); + test_assert_id_reg_unchanged(vcpu, SYS_CTR_EL0); + + ksft_test_result_pass("%s\n", __func__); +} + int main(void) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; bool aarch64_only; uint64_t val, el0; - int ftr_cnt; + int test_cnt; TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); @@ -569,21 +669,25 @@ int main(void) ksft_print_header(); - ftr_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + - ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + - ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + - ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + - ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + - MPAM_IDREG_TEST; + test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + + ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + + ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2 + + MPAM_IDREG_TEST; - ksft_set_plan(ftr_cnt); + ksft_set_plan(test_cnt); + + test_vm_ftr_id_regs(vcpu, aarch64_only); + test_vcpu_ftr_id_regs(vcpu); - test_user_set_reg(vcpu, aarch64_only); test_user_set_mpam_reg(vcpu); test_guest_reg_read(vcpu); + test_reset_preserves_id_regs(vcpu); + kvm_vm_free(vm); ksft_finished(); diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 16ae0ac01879af47669e2a2bbe53479eed874332..9e518b56282736caeaf549e1d504ce53f9fc64e3 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -119,8 +119,8 @@ enum { /* Access flag update enable/disable */ #define TCR_EL1_HA (1ULL << 39) -void aarch64_get_supported_page_sizes(uint32_t ipa, - bool *ps4k, bool *ps16k, bool *ps64k); +void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, + uint32_t *ipa16k, uint32_t *ipa64k); void vm_init_descriptor_tables(struct kvm_vm *vm); void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); diff --git a/tools/testing/selftests/kvm/include/guest_modes.h b/tools/testing/selftests/kvm/include/guest_modes.h index b691df33e64e122a87996ef5cc17e8aceead4685..63f5167397ccb4bf9ed07ce90b99cc89b36bd20c 100644 --- a/tools/testing/selftests/kvm/include/guest_modes.h +++ b/tools/testing/selftests/kvm/include/guest_modes.h @@ -11,8 +11,8 @@ struct guest_mode { extern struct guest_mode guest_modes[NUM_VM_MODES]; -#define guest_mode_append(mode, supported, enabled) ({ \ - guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ +#define guest_mode_append(mode, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ (enabled), (enabled) }; \ }) void guest_modes_append_default(void); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 4c4a2e3b8eee2169e387f048d9022353b3a39682..63d9399a06b032695ebd5546e55eb28fa62fdcf1 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -172,6 +172,7 @@ static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, enum vm_guest_mode { VM_MODE_P52V48_4K, + VM_MODE_P52V48_16K, VM_MODE_P52V48_64K, VM_MODE_P48V48_4K, VM_MODE_P48V48_16K, diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 6fe12e985ba568bea7b2a7829730f503b674af76..41c776b642c0cd0be722e4bad1e0e9cc1f0cff80 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -12,6 +12,7 @@ #include "kvm_util.h" #include "processor.h" #include +#include #define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 @@ -58,13 +59,25 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva) return (gva >> vm->page_shift) & mask; } +static inline bool use_lpa2_pte_format(struct kvm_vm *vm) +{ + return (vm->page_size == SZ_4K || vm->page_size == SZ_16K) && + (vm->pa_bits > 48 || vm->va_bits > 48); +} + static uint64_t addr_pte(struct kvm_vm *vm, uint64_t pa, uint64_t attrs) { uint64_t pte; - pte = pa & GENMASK(47, vm->page_shift); - if (vm->page_shift == 16) - pte |= FIELD_GET(GENMASK(51, 48), pa) << 12; + if (use_lpa2_pte_format(vm)) { + pte = pa & GENMASK(49, vm->page_shift); + pte |= FIELD_GET(GENMASK(51, 50), pa) << 8; + attrs &= ~GENMASK(9, 8); + } else { + pte = pa & GENMASK(47, vm->page_shift); + if (vm->page_shift == 16) + pte |= FIELD_GET(GENMASK(51, 48), pa) << 12; + } pte |= attrs; return pte; @@ -74,9 +87,14 @@ static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte) { uint64_t pa; - pa = pte & GENMASK(47, vm->page_shift); - if (vm->page_shift == 16) - pa |= FIELD_GET(GENMASK(15, 12), pte) << 48; + if (use_lpa2_pte_format(vm)) { + pa = pte & GENMASK(49, vm->page_shift); + pa |= FIELD_GET(GENMASK(9, 8), pte) << 50; + } else { + pa = pte & GENMASK(47, vm->page_shift); + if (vm->page_shift == 16) + pa |= FIELD_GET(GENMASK(15, 12), pte) << 48; + } return pa; } @@ -266,9 +284,6 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) /* Configure base granule size */ switch (vm->mode) { - case VM_MODE_P52V48_4K: - TEST_FAIL("AArch64 does not support 4K sized pages " - "with 52-bit physical address ranges"); case VM_MODE_PXXV48_4K: TEST_FAIL("AArch64 does not support 4K sized pages " "with ANY-bit physical address ranges"); @@ -278,12 +293,14 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) case VM_MODE_P36V48_64K: tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ break; + case VM_MODE_P52V48_16K: case VM_MODE_P48V48_16K: case VM_MODE_P40V48_16K: case VM_MODE_P36V48_16K: case VM_MODE_P36V47_16K: tcr_el1 |= 2ul << 14; /* TG0 = 16KB */ break; + case VM_MODE_P52V48_4K: case VM_MODE_P48V48_4K: case VM_MODE_P40V48_4K: case VM_MODE_P36V48_4K: @@ -297,6 +314,8 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) /* Configure output size */ switch (vm->mode) { + case VM_MODE_P52V48_4K: + case VM_MODE_P52V48_16K: case VM_MODE_P52V48_64K: tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2; @@ -325,6 +344,8 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */; tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; + if (use_lpa2_pte_format(vm)) + tcr_el1 |= (1ul << 59) /* DS */; vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); @@ -492,12 +513,24 @@ uint32_t guest_get_vcpuid(void) return read_sysreg(tpidr_el1); } -void aarch64_get_supported_page_sizes(uint32_t ipa, - bool *ps4k, bool *ps16k, bool *ps64k) +static uint32_t max_ipa_for_page_size(uint32_t vm_ipa, uint32_t gran, + uint32_t not_sup_val, uint32_t ipa52_min_val) +{ + if (gran == not_sup_val) + return 0; + else if (gran >= ipa52_min_val && vm_ipa >= 52) + return 52; + else + return min(vm_ipa, 48U); +} + +void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, + uint32_t *ipa16k, uint32_t *ipa64k) { struct kvm_vcpu_init preferred_init; int kvm_fd, vm_fd, vcpu_fd, err; uint64_t val; + uint32_t gran; struct kvm_one_reg reg = { .id = KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR0_EL1), .addr = (uint64_t)&val, @@ -518,9 +551,17 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, err = ioctl(vcpu_fd, KVM_GET_ONE_REG, ®); TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); - *ps4k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val) != 0xf; - *ps64k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val) == 0; - *ps16k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val) != 0; + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val); + *ipa4k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN4_NI, + ID_AA64MMFR0_EL1_TGRAN4_52_BIT); + + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val); + *ipa64k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN64_NI, + ID_AA64MMFR0_EL1_TGRAN64_IMP); + + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val); + *ipa16k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN16_NI, + ID_AA64MMFR0_EL1_TGRAN16_52_BIT); close(vcpu_fd); close(vm_fd); diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c index 1df3ce4b16fd86a3f1750700d1392872a60cbf92..b04901e5513874b5c51606b6b63145806b875b58 100644 --- a/tools/testing/selftests/kvm/lib/guest_modes.c +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -14,37 +14,33 @@ struct guest_mode guest_modes[NUM_VM_MODES]; void guest_modes_append_default(void) { #ifndef __aarch64__ - guest_mode_append(VM_MODE_DEFAULT, true, true); + guest_mode_append(VM_MODE_DEFAULT, true); #else { unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); - bool ps4k, ps16k, ps64k; + uint32_t ipa4k, ipa16k, ipa64k; int i; - aarch64_get_supported_page_sizes(limit, &ps4k, &ps16k, &ps64k); + aarch64_get_supported_page_sizes(limit, &ipa4k, &ipa16k, &ipa64k); - vm_mode_default = NUM_VM_MODES; + guest_mode_append(VM_MODE_P52V48_4K, ipa4k >= 52); + guest_mode_append(VM_MODE_P52V48_16K, ipa16k >= 52); + guest_mode_append(VM_MODE_P52V48_64K, ipa64k >= 52); - if (limit >= 52) - guest_mode_append(VM_MODE_P52V48_64K, ps64k, ps64k); - if (limit >= 48) { - guest_mode_append(VM_MODE_P48V48_4K, ps4k, ps4k); - guest_mode_append(VM_MODE_P48V48_16K, ps16k, ps16k); - guest_mode_append(VM_MODE_P48V48_64K, ps64k, ps64k); - } - if (limit >= 40) { - guest_mode_append(VM_MODE_P40V48_4K, ps4k, ps4k); - guest_mode_append(VM_MODE_P40V48_16K, ps16k, ps16k); - guest_mode_append(VM_MODE_P40V48_64K, ps64k, ps64k); - if (ps4k) - vm_mode_default = VM_MODE_P40V48_4K; - } - if (limit >= 36) { - guest_mode_append(VM_MODE_P36V48_4K, ps4k, ps4k); - guest_mode_append(VM_MODE_P36V48_16K, ps16k, ps16k); - guest_mode_append(VM_MODE_P36V48_64K, ps64k, ps64k); - guest_mode_append(VM_MODE_P36V47_16K, ps16k, ps16k); - } + guest_mode_append(VM_MODE_P48V48_4K, ipa4k >= 48); + guest_mode_append(VM_MODE_P48V48_16K, ipa16k >= 48); + guest_mode_append(VM_MODE_P48V48_64K, ipa64k >= 48); + + guest_mode_append(VM_MODE_P40V48_4K, ipa4k >= 40); + guest_mode_append(VM_MODE_P40V48_16K, ipa16k >= 40); + guest_mode_append(VM_MODE_P40V48_64K, ipa64k >= 40); + + guest_mode_append(VM_MODE_P36V48_4K, ipa4k >= 36); + guest_mode_append(VM_MODE_P36V48_16K, ipa16k >= 36); + guest_mode_append(VM_MODE_P36V48_64K, ipa64k >= 36); + guest_mode_append(VM_MODE_P36V47_16K, ipa16k >= 36); + + vm_mode_default = ipa4k >= 40 ? VM_MODE_P40V48_4K : NUM_VM_MODES; /* * Pick the first supported IPA size if the default @@ -72,7 +68,7 @@ void guest_modes_append_default(void) close(kvm_fd); /* Starting with z13 we have 47bits of physical address */ if (info.ibc >= 0x30) - guest_mode_append(VM_MODE_P47V64_4K, true, true); + guest_mode_append(VM_MODE_P47V64_4K, true); } #endif #ifdef __riscv @@ -80,9 +76,9 @@ void guest_modes_append_default(void) unsigned int sz = kvm_check_cap(KVM_CAP_VM_GPA_BITS); if (sz >= 52) - guest_mode_append(VM_MODE_P52V48_4K, true, true); + guest_mode_append(VM_MODE_P52V48_4K, true); if (sz >= 48) - guest_mode_append(VM_MODE_P48V48_4K, true, true); + guest_mode_append(VM_MODE_P48V48_4K, true); } #endif } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index bc1ef536b640c6a4528c98afefbc9ef0b5ee9284..d272459ea18c8cb94d560689b364d3be99108ea3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -148,6 +148,7 @@ const char *vm_guest_mode_string(uint32_t i) { static const char * const strings[] = { [VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages", + [VM_MODE_P52V48_16K] = "PA-bits:52, VA-bits:48, 16K pages", [VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages", [VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages", [VM_MODE_P48V48_16K] = "PA-bits:48, VA-bits:48, 16K pages", @@ -173,6 +174,7 @@ const char *vm_guest_mode_string(uint32_t i) const struct vm_guest_mode_params vm_guest_mode_params[] = { [VM_MODE_P52V48_4K] = { 52, 48, 0x1000, 12 }, + [VM_MODE_P52V48_16K] = { 52, 48, 0x4000, 14 }, [VM_MODE_P52V48_64K] = { 52, 48, 0x10000, 16 }, [VM_MODE_P48V48_4K] = { 48, 48, 0x1000, 12 }, [VM_MODE_P48V48_16K] = { 48, 48, 0x4000, 14 }, @@ -251,6 +253,7 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode) case VM_MODE_P36V48_64K: vm->pgtable_levels = 3; break; + case VM_MODE_P52V48_16K: case VM_MODE_P48V48_16K: case VM_MODE_P40V48_16K: case VM_MODE_P36V48_16K: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bcd8b7639f21231b23bc9d5f1e1b0ed807fa7e86..4e1181d45aff3ce21fd2beccf09d388ace929ac5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6445,6 +6445,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) __this_cpu_write(kvm_running_vcpu, vcpu); kvm_arch_sched_in(vcpu, cpu); kvm_arch_vcpu_load(vcpu, cpu); + + WRITE_ONCE(vcpu->scheduled_out, false); } static void kvm_sched_out(struct preempt_notifier *pn, @@ -6452,6 +6454,8 @@ static void kvm_sched_out(struct preempt_notifier *pn, { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + WRITE_ONCE(vcpu->scheduled_out, true); + if (task_is_runnable(current)) { WRITE_ONCE(vcpu->preempted, true); WRITE_ONCE(vcpu->ready, true);