From 72519105a96b4279b98943f471376cf535d01ae0 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Sat, 16 Apr 2022 02:45:32 +0000 Subject: [PATCH 1/8] x86/apic: Do apic driver probe for "nosmp" use case ANBZ: #4829 commit 7a116a2dd32d96869f0f93bac00b900859ba0434 upstream. For the "nosmp" use case, the APIC initialization code selects "APIC_SYMMETRIC_IO_NO_ROUTING" as the default interrupt mode and avoids probing APIC drivers. This works well for the default APIC modes, but for the x2APIC case the probe function is required to allocate the cluster_hotplug mask. So in the APIC_SYMMETRIC_IO_NO_ROUTING case when the x2APIC is initialized it dereferences a NULL pointer and the kernel crashes. This was observed on a TDX platform where x2APIC is enabled and "nosmp" command line option is allowed. To fix this issue, probe APIC drivers via default_setup_apic_routing() for the APIC_SYMMETRIC_IO_NO_ROUTING interrupt mode too. Intel-SIG: commit 7a116a2dd32d x86/apic: Do apic driver probe for "nosmp" use case. Minor fixup for tdx guest core features. Suggested-by: Kirill A. Shutemov Suggested-by: Rafael J. Wysocki Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/a64f864e1114bcd63593286aaf61142cfce384ea.1650076869.git.sathyanarayanan.kuppuswamy@intel.com [ zhiminghufighting: amend commit log ] Signed-off-by: zhiminghufighting --- arch/x86/kernel/apic/apic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8768a1acf162..b8236b2aa1a9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1422,22 +1422,21 @@ void __init apic_intr_mode_init(void) return; case APIC_VIRTUAL_WIRE: pr_info("APIC: Switch to virtual wire mode setup\n"); - default_setup_apic_routing(); break; case APIC_VIRTUAL_WIRE_NO_CONFIG: pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); upmode = true; - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO: pr_info("APIC: Switch to symmetric I/O mode setup\n"); - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO_NO_ROUTING: pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); break; } + default_setup_apic_routing(); + if (x86_platform.apic_post_init) x86_platform.apic_post_init(); -- Gitee From d850f826492db386a649fd4fa95b5683fbf16894 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 May 2022 10:38:39 +0200 Subject: [PATCH 2/8] x86/tdx: Fix RETs in TDX asm ANBZ: #4829 commit c796f02162e428b595ff70196dca161ee46b163b upstream. Because build-testing is over-rated, fix a few trivial objtool complaints: vmlinux.o: warning: objtool: __tdx_module_call+0x3e: missing int3 after ret vmlinux.o: warning: objtool: __tdx_hypercall+0x6e: missing int3 after ret Intel-SIG: commit c796f02162e4 x86/tdx: Fix RETs in TDX asm. Fixes: eb94f1b6a70a ("x86/tdx: Add __tdx_module_call() and __tdx_hypercall() helper functions") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220520083839.GR2578@worktop.programming.kicks-ass.net [ zhiminghufighting: amend commit log ] Signed-off-by: zhiminghufighting --- arch/x86/coco/tdx/tdcall.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index 245888290bb6..f5a44e648c91 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -73,7 +73,7 @@ SYM_FUNC_START(__tdx_module_call) FRAME_BEGIN TDX_MODULE_CALL host=0 FRAME_END - ret + RET SYM_FUNC_END(__tdx_module_call) /* @@ -196,7 +196,7 @@ SYM_FUNC_START(__tdx_hypercall) FRAME_END - retq + RET .Lpanic: call __tdx_hypercall_failed /* __tdx_hypercall_failed never returns */ -- Gitee From 8fa72d925944d5f94158fd724485ea820c20b640 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Jun 2022 15:01:33 +0300 Subject: [PATCH 3/8] x86/tdx: Fix early #VE handling ANBZ: #4829 commit 60428d8bc27f52e8f1540f98e1b6ef0156d43f0d upstream. tdx_early_handle_ve() does not increment RIP after successfully handling the exception. That leads to infinite loop of exceptions. Move RIP when exceptions are successfully handled. [ dhansen: make problem statement more clear ] Intel-SIG: commit 60428d8bc27f x86/tdx: Fix early #VE handling. Fixes: 32e72854fa5f ("x86/tdx: Port I/O: Add early boot support") Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://lkml.kernel.org/r/20220614120135.14812-2-kirill.shutemov@linux.intel.com [ zhiminghufighting: amend commit log ] Signed-off-by: zhiminghufighting --- arch/x86/coco/tdx/tdx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 90c3e4d599d6..2f68a6e9480d 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -448,13 +448,17 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual) __init bool tdx_early_handle_ve(struct pt_regs *regs) { struct ve_info ve; + bool ret; tdx_get_ve_info(&ve); if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION) return false; - return handle_io(regs, ve.exit_qual); + ret = handle_io(regs, ve.exit_qual); + if (ret) + regs->ip += ve.instr_len; + return ret; } void tdx_get_ve_info(struct ve_info *ve) -- Gitee From 81efcfdbea1c4ab381e3c09adb067ae40c229060 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Jun 2022 15:01:34 +0300 Subject: [PATCH 4/8] x86/tdx: Clarify RIP adjustments in #VE handler ANBZ: #4829 commit cdd85786f4b3b9273e4376e69aa95a2d71722764 upstream. After successful #VE handling, tdx_handle_virt_exception() has to move RIP to the next instruction. The handler needs to know the length of the instruction. If the #VE happened due to instruction execution, the GET_VEINFO TDX module call provides info on the instruction in R10, including its length. For #VE due to EPT violation, the info in R10 is not populand and the kernel must decode the instruction manually to find out its length. Restructure the code to make it explicit that the instruction length depends on the type of #VE. Make individual #VE handlers return the instruction length on success or -errno on failure. [ dhansen: fix up changelog and comments ] Intel-SIG: commit cdd85786f4b3 x86/tdx: Clarify RIP adjustments in #VE handler. Suggested-by: Dave Hansen Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220614120135.14812-3-kirill.shutemov@linux.intel.com [ zhiminghufighting: amend commit log ] Signed-off-by: zhiminghufighting --- arch/x86/coco/tdx/tdx.c | 178 +++++++++++++++++++++++++++------------- 1 file changed, 123 insertions(+), 55 deletions(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 2f68a6e9480d..2240f8bb15f3 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -125,6 +125,51 @@ static u64 get_cc_mask(void) return BIT_ULL(gpa_width - 1); } +/* + * The TDX module spec states that #VE may be injected for a limited set of + * reasons: + * + * - Emulation of the architectural #VE injection on EPT violation; + * + * - As a result of guest TD execution of a disallowed instruction, + * a disallowed MSR access, or CPUID virtualization; + * + * - A notification to the guest TD about anomalous behavior; + * + * The last one is opt-in and is not used by the kernel. + * + * The Intel Software Developer's Manual describes cases when instruction + * length field can be used in section "Information for VM Exits Due to + * Instruction Execution". + * + * For TDX, it ultimately means GET_VEINFO provides reliable instruction length + * information if #VE occurred due to instruction execution, but not for EPT + * violations. + */ +static int ve_instr_len(struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_HLT: + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + case EXIT_REASON_CPUID: + case EXIT_REASON_IO_INSTRUCTION: + /* It is safe to use ve->instr_len for #VE due instructions */ + return ve->instr_len; + case EXIT_REASON_EPT_VIOLATION: + /* + * For EPT violations, ve->insn_len is not defined. For those, + * the kernel must decode instructions manually and should not + * be using this function. + */ + WARN_ONCE(1, "ve->instr_len is not defined for EPT violations"); + return 0; + default: + WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason); + return ve->instr_len; + } +} + static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) { struct tdx_hypercall_args args = { @@ -148,7 +193,7 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0); } -static bool handle_halt(void) +static int handle_halt(struct ve_info *ve) { /* * Since non safe halt is mainly used in CPU offlining @@ -159,9 +204,9 @@ static bool handle_halt(void) const bool do_sti = false; if (__halt(irq_disabled, do_sti)) - return false; + return -EIO; - return true; + return ve_instr_len(ve); } void __cpuidle tdx_safe_halt(void) @@ -181,7 +226,7 @@ void __cpuidle tdx_safe_halt(void) WARN_ONCE(1, "HLT instruction emulation failed\n"); } -static bool read_msr(struct pt_regs *regs) +static int read_msr(struct pt_regs *regs, struct ve_info *ve) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, @@ -195,14 +240,14 @@ static bool read_msr(struct pt_regs *regs) * (GHCI), section titled "TDG.VP.VMCALL". */ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) - return false; + return -EIO; regs->ax = lower_32_bits(args.r11); regs->dx = upper_32_bits(args.r11); - return true; + return ve_instr_len(ve); } -static bool write_msr(struct pt_regs *regs) +static int write_msr(struct pt_regs *regs, struct ve_info *ve) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, @@ -216,10 +261,13 @@ static bool write_msr(struct pt_regs *regs) * can be found in TDX Guest-Host-Communication Interface * (GHCI) section titled "TDG.VP.VMCALL". */ - return !__tdx_hypercall(&args, 0); + if (__tdx_hypercall(&args, 0)) + return -EIO; + + return ve_instr_len(ve); } -static bool handle_cpuid(struct pt_regs *regs) +static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, @@ -237,7 +285,7 @@ static bool handle_cpuid(struct pt_regs *regs) */ if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) { regs->ax = regs->bx = regs->cx = regs->dx = 0; - return true; + return ve_instr_len(ve); } /* @@ -246,7 +294,7 @@ static bool handle_cpuid(struct pt_regs *regs) * (GHCI), section titled "VP.VMCALL". */ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) - return false; + return -EIO; /* * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of @@ -258,7 +306,7 @@ static bool handle_cpuid(struct pt_regs *regs) regs->cx = args.r14; regs->dx = args.r15; - return true; + return ve_instr_len(ve); } static bool mmio_read(int size, unsigned long addr, unsigned long *val) @@ -284,7 +332,7 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val) EPT_WRITE, addr, val); } -static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) +static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) { char buffer[MAX_INSN_SIZE]; unsigned long *reg, val; @@ -295,34 +343,36 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) /* Only in-kernel MMIO is supported */ if (WARN_ON_ONCE(user_mode(regs))) - return false; + return -EFAULT; if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) - return false; + return -EFAULT; if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) - return false; + return -EINVAL; mmio = insn_decode_mmio(&insn, &size); if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) - return false; + return -EINVAL; if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { reg = insn_get_modrm_reg_ptr(&insn, regs); if (!reg) - return false; + return -EINVAL; } - ve->instr_len = insn.length; - /* Handle writes first */ switch (mmio) { case MMIO_WRITE: memcpy(&val, reg, size); - return mmio_write(size, ve->gpa, val); + if (!mmio_write(size, ve->gpa, val)) + return -EIO; + return insn.length; case MMIO_WRITE_IMM: val = insn.immediate.value; - return mmio_write(size, ve->gpa, val); + if (!mmio_write(size, ve->gpa, val)) + return -EIO; + return insn.length; case MMIO_READ: case MMIO_READ_ZERO_EXTEND: case MMIO_READ_SIGN_EXTEND: @@ -335,15 +385,15 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) * decoded or handled properly. It was likely not using io.h * helpers or accessed MMIO accidentally. */ - return false; + return -EINVAL; default: WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); - return false; + return -EINVAL; } /* Handle reads */ if (!mmio_read(size, ve->gpa, &val)) - return false; + return -EIO; switch (mmio) { case MMIO_READ: @@ -365,13 +415,13 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) default: /* All other cases has to be covered with the first switch() */ WARN_ON_ONCE(1); - return false; + return -EINVAL; } if (extend_size) memset(reg, extend_val, extend_size); memcpy(reg, &val, size); - return true; + return insn.length; } static bool handle_in(struct pt_regs *regs, int size, int port) @@ -422,13 +472,14 @@ static bool handle_out(struct pt_regs *regs, int size, int port) * * Return True on success or False on failure. */ -static bool handle_io(struct pt_regs *regs, u32 exit_qual) +static int handle_io(struct pt_regs *regs, struct ve_info *ve) { + u32 exit_qual = ve->exit_qual; int size, port; - bool in; + bool in, ret; if (VE_IS_IO_STRING(exit_qual)) - return false; + return -EIO; in = VE_IS_IO_IN(exit_qual); size = VE_GET_IO_SIZE(exit_qual); @@ -436,9 +487,13 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual) if (in) - return handle_in(regs, size, port); + ret = handle_in(regs, size, port); else - return handle_out(regs, size, port); + ret = handle_out(regs, size, port); + if (!ret) + return -EIO; + + return ve_instr_len(ve); } /* @@ -448,17 +503,19 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual) __init bool tdx_early_handle_ve(struct pt_regs *regs) { struct ve_info ve; - bool ret; + int insn_len; tdx_get_ve_info(&ve); if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION) return false; - ret = handle_io(regs, ve.exit_qual); - if (ret) - regs->ip += ve.instr_len; - return ret; + insn_len = handle_io(regs, &ve); + if (insn_len < 0) + return false; + + regs->ip += insn_len; + return true; } void tdx_get_ve_info(struct ve_info *ve) @@ -491,54 +548,65 @@ void tdx_get_ve_info(struct ve_info *ve) ve->instr_info = upper_32_bits(out.r10); } -/* Handle the user initiated #VE */ -static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve) +/* + * Handle the user initiated #VE. + * + * On success, returns the number of bytes RIP should be incremented (>=0) + * or -errno on error. + */ +static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve) { switch (ve->exit_reason) { case EXIT_REASON_CPUID: - return handle_cpuid(regs); + return handle_cpuid(regs, ve); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); - return false; + return -EIO; } } -/* Handle the kernel #VE */ -static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) +/* + * Handle the kernel #VE. + * + * On success, returns the number of bytes RIP should be incremented (>=0) + * or -errno on error. + */ +static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) { switch (ve->exit_reason) { case EXIT_REASON_HLT: - return handle_halt(); + return handle_halt(ve); case EXIT_REASON_MSR_READ: - return read_msr(regs); + return read_msr(regs, ve); case EXIT_REASON_MSR_WRITE: - return write_msr(regs); + return write_msr(regs, ve); case EXIT_REASON_CPUID: - return handle_cpuid(regs); + return handle_cpuid(regs, ve); case EXIT_REASON_EPT_VIOLATION: return handle_mmio(regs, ve); case EXIT_REASON_IO_INSTRUCTION: - return handle_io(regs, ve->exit_qual); + return handle_io(regs, ve); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); - return false; + return -EIO; } } bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) { - bool ret; + int insn_len; if (user_mode(regs)) - ret = virt_exception_user(regs, ve); + insn_len = virt_exception_user(regs, ve); else - ret = virt_exception_kernel(regs, ve); + insn_len = virt_exception_kernel(regs, ve); + if (insn_len < 0) + return false; /* After successful #VE handling, move the IP */ - if (ret) - regs->ip += ve->instr_len; + regs->ip += insn_len; - return ret; + return true; } static bool tdx_tlb_flush_required(bool private) -- Gitee From 258a04a2798769e55a486df5fd8b032d714d07a3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Jun 2022 15:01:35 +0300 Subject: [PATCH 5/8] x86/tdx: Handle load_unaligned_zeropad() page-cross to a shared page ANBZ: #4829 commit 1e7769653b06b56b7ea7d56911d2d5b2957750cd upstream. load_unaligned_zeropad() can lead to unwanted loads across page boundaries. The unwanted loads are typically harmless. But, they might be made to totally unrelated or even unmapped memory. load_unaligned_zeropad() relies on exception fixup (#PF, #GP and now #VE) to recover from these unwanted loads. In TDX guests, the second page can be shared page and a VMM may configure it to trigger #VE. The kernel assumes that #VE on a shared page is an MMIO access and tries to decode instruction to handle it. In case of load_unaligned_zeropad() it may result in confusion as it is not MMIO access. Fix it by detecting split page MMIO accesses and failing them. load_unaligned_zeropad() will recover using exception fixups. The issue was discovered by analysis and reproduced artificially. It was not triggered during testing. [ dhansen: fix up changelogs and comments for grammar and clarity, plus incorporate Kirill's off-by-one fix] Intel-SIG: commit 1e7769653b06 x86/tdx: Handle load_unaligned_zeropad() page-cross to a shared page. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220614120135.14812-4-kirill.shutemov@linux.intel.com [ zhiminghufighting: amend commit log ] Signed-off-by: zhiminghufighting --- arch/x86/coco/tdx/tdx.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 2240f8bb15f3..65fa18b7e365 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -334,8 +334,8 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val) static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) { + unsigned long *reg, val, vaddr; char buffer[MAX_INSN_SIZE]; - unsigned long *reg, val; struct insn insn = {}; enum mmio_type mmio; int size, extend_size; @@ -361,6 +361,19 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) return -EINVAL; } + /* + * Reject EPT violation #VEs that split pages. + * + * MMIO accesses are supposed to be naturally aligned and therefore + * never cross page boundaries. Seeing split page accesses indicates + * a bug or a load_unaligned_zeropad() that stepped into an MMIO page. + * + * load_unaligned_zeropad() will recover using exception fixups. + */ + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs); + if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) + return -EFAULT; + /* Handle writes first */ switch (mmio) { case MMIO_WRITE: -- Gitee From 0d534804d468fc7a92e57ef12270bbcce23bb505 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Tue, 16 Aug 2022 16:19:42 -0700 Subject: [PATCH 6/8] x86/apic: Don't disable x2APIC if locked ANBZ: #4829 commit b8d1d163604bd1e600b062fb00de5dc42baa355f upstream. The APIC supports two modes, legacy APIC (or xAPIC), and Extended APIC (or x2APIC). X2APIC mode is mostly compatible with legacy APIC, but it disables the memory-mapped APIC interface in favor of one that uses MSRs. The APIC mode is controlled by the EXT bit in the APIC MSR. The MMIO/xAPIC interface has some problems, most notably the APIC LEAK [1]. This bug allows an attacker to use the APIC MMIO interface to extract data from the SGX enclave. Introduce support for a new feature that will allow the BIOS to lock the APIC in x2APIC mode. If the APIC is locked in x2APIC mode and the kernel tries to disable the APIC or revert to legacy APIC mode a GP fault will occur. Introduce support for a new MSR (IA32_XAPIC_DISABLE_STATUS) and handle the new locked mode when the LEGACY_XAPIC_DISABLED bit is set by preventing the kernel from trying to disable the x2APIC. On platforms with the IA32_XAPIC_DISABLE_STATUS MSR, if SGX or TDX are enabled the LEGACY_XAPIC_DISABLED will be set by the BIOS. If legacy APIC is required, then it SGX and TDX need to be disabled in the BIOS. [1]: https://aepicleak.com/aepicleak.pdf Intel-SIG: commit b8d1d163604b x86/apic: Don't disable x2APIC if locked. Signed-off-by: Daniel Sneddon Signed-off-by: Dave Hansen Acked-by: Dave Hansen Tested-by: Neelima Krishnan Link: https://lkml.kernel.org/r/20220816231943.1152579-1-daniel.sneddon@linux.intel.com [ zhiminghufighting: amend commit log ] Signed-off-by: zhiminghufighting --- .../admin-guide/kernel-parameters.txt | 4 ++ arch/x86/Kconfig | 7 ++- arch/x86/include/asm/cpu.h | 2 + arch/x86/include/asm/msr-index.h | 13 ++++++ arch/x86/kernel/apic/apic.c | 44 +++++++++++++++++-- 5 files changed, 65 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4f81a14aa76a..d32656e3a8cb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3522,6 +3522,10 @@ nox2apic [X86-64,APIC] Do not enable x2APIC mode. + NOTE: this parameter will be ignored on systems with the + LEGACY_XAPIC_DISABLED bit set in the + IA32_XAPIC_DISABLE_STATUS MSR. + cpu0_hotplug [X86] Turn on CPU0 hotplug feature when CONFIG_BOOTPARAM_HOTPLUG_CPU0 is off. Some features depend on CPU0. Known dependencies are: diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f0c048d20479..d377bb044136 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -444,6 +444,11 @@ config X86_X2APIC This allows 32-bit apic IDs (so it can support very large systems), and accesses the local apic via MSRs not via mmio. + Some Intel systems circa 2022 and later are locked into x2APIC mode + and can not fall back to the legacy APIC modes if SGX or TDX are + enabled in the BIOS. They will be unable to boot without enabling + this option. + If you don't know what to do here, say N. config X86_MPPARSE @@ -1991,7 +1996,7 @@ endchoice config X86_SGX bool "Software Guard eXtensions (SGX)" - depends on X86_64 && CPU_SUP_INTEL + depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC depends on CRYPTO=y depends on CRYPTO_SHA256=y select SRCU diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index f43d37309bb1..8a9c04d3748c 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -99,6 +99,8 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, return p1 & p2; } +extern u64 x86_read_arch_cap_msr(void); + int intel_find_matching_signature(void *mc, unsigned int csig, int cpf); int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 06434c9e16a7..be0920445efe 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -156,6 +156,11 @@ * Return Stack Buffer Predictions. */ +#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* + * IA32_XAPIC_DISABLE_STATUS MSR + * supported + */ + #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* * Writeback and invalidate the @@ -1044,6 +1049,14 @@ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 +/* x2APIC locked status */ +#define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD +#define LEGACY_XAPIC_DISABLED BIT(0) /* + * x2APIC mode is locked and + * disabling x2APIC will cause + * a #GP + */ + /* Intel SEAMRR */ #define MSR_IA32_SEAMRR_PHYS_BASE 0x00001400 #define MSR_IA32_SEAMRR_PHYS_MASK 0x00001401 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b8236b2aa1a9..5123fe32ca20 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -60,6 +60,7 @@ #include #include #include +#include unsigned int num_processors; @@ -1749,11 +1750,26 @@ int x2apic_mode; enum { X2APIC_OFF, - X2APIC_ON, X2APIC_DISABLED, + /* All states below here have X2APIC enabled */ + X2APIC_ON, + X2APIC_ON_LOCKED }; static int x2apic_state; +static bool x2apic_hw_locked(void) +{ + u64 ia32_cap; + u64 msr; + + ia32_cap = x86_read_arch_cap_msr(); + if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + return (msr & LEGACY_XAPIC_DISABLED); + } + return false; +} + static void __x2apic_disable(void) { u64 msr; @@ -1791,6 +1807,10 @@ static int __init setup_nox2apic(char *str) apicid); return 0; } + if (x2apic_hw_locked()) { + pr_warn("APIC locked in x2apic mode, can't disable\n"); + return 0; + } pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } @@ -1805,10 +1825,18 @@ early_param("nox2apic", setup_nox2apic); void x2apic_setup(void) { /* - * If x2apic is not in ON state, disable it if already enabled + * Try to make the AP's APIC state match that of the BSP, but if the + * BSP is unlocked and the AP is locked then there is a state mismatch. + * Warn about the mismatch in case a GP fault occurs due to a locked AP + * trying to be turned off. + */ + if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked()) + pr_warn("x2apic lock mismatch between BSP and AP.\n"); + /* + * If x2apic is not in ON or LOCKED state, disable it if already enabled * from BIOS. */ - if (x2apic_state != X2APIC_ON) { + if (x2apic_state < X2APIC_ON) { __x2apic_disable(); return; } @@ -1829,6 +1857,11 @@ static __init void x2apic_disable(void) if (x2apic_id >= 255) panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + if (x2apic_hw_locked()) { + pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id); + return; + } + __x2apic_disable(); register_lapic_address(mp_lapic_addr); } @@ -1889,7 +1922,10 @@ void __init check_x2apic(void) if (x2apic_enabled()) { pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; - x2apic_state = X2APIC_ON; + if (x2apic_hw_locked()) + x2apic_state = X2APIC_ON_LOCKED; + else + x2apic_state = X2APIC_ON; } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } -- Gitee From 35b1931f453eac4ec0a5ee54145597ae81d18cd3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 28 Oct 2022 17:12:19 +0300 Subject: [PATCH 7/8] x86/tdx: Prepare for using "INFO" call for a second purpose ANBZ: #4829 commit a6dd6f39008bb3ef7c73ef0a2acc2a4209555bd8 upstream. The TDG.VP.INFO TDCALL provides the guest with various details about the TDX system that the guest needs to run. Only one field is currently used: 'gpa_width' which tells the guest which PTE bits mark pages shared or private. A second field is now needed: the guest "TD attributes" to tell if virtualization exceptions are configured in a way that can harm the guest. Make the naming and calling convention more generic and discrete from the mask-centric one. Thanks to Sathya for the inspiration here, but there's no code, comments or changelogs left from where he started. Intel-SIG: commit a6dd6f39008b x86/tdx: Prepare for using "INFO" call for a second purpose. Signed-off-by: Dave Hansen Acked-by: Kirill A. Shutemov Tested-by: Kirill A. Shutemov Cc: stable@vger.kernel.org [ zhiminghufighting: amend commit log ] Signed-off-by: zhiminghufighting --- arch/x86/coco/tdx/tdx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 65fa18b7e365..79427fc15675 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -99,7 +99,7 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); } -static u64 get_cc_mask(void) +static void tdx_parse_tdinfo(u64 *cc_mask) { struct tdx_module_output out; unsigned int gpa_width; @@ -122,7 +122,7 @@ static u64 get_cc_mask(void) * The highest bit of a guest physical address is the "sharing" bit. * Set it for shared pages and clear it for private pages. */ - return BIT_ULL(gpa_width - 1); + *cc_mask = BIT_ULL(gpa_width - 1); } /* @@ -759,7 +759,7 @@ void __init tdx_early_init(void) setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); cc_set_vendor(CC_VENDOR_INTEL); - cc_mask = get_cc_mask(); + tdx_parse_tdinfo(&cc_mask); cc_set_mask(cc_mask); if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) -- Gitee From 2b2dd8237235c33cd9418f41262cb9bccbd336d8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 28 Oct 2022 17:12:20 +0300 Subject: [PATCH 8/8] x86/tdx: Panic on bad configs that #VE on "private" memory access ANBZ: #4829 commit 373e715e31bf4e0f129befe87613a278fac228d3 upstream. All normal kernel memory is "TDX private memory". This includes everything from kernel stacks to kernel text. Handling exceptions on arbitrary accesses to kernel memory is essentially impossible because they can happen in horribly nasty places like kernel entry/exit. But, TDX hardware can theoretically _deliver_ a virtualization exception (#VE) on any access to private memory. But, it's not as bad as it sounds. TDX can be configured to never deliver these exceptions on private memory with a "TD attribute" called ATTR_SEPT_VE_DISABLE. The guest has no way to *set* this attribute, but it can check it. Ensure ATTR_SEPT_VE_DISABLE is set in early boot. panic() if it is unset. There is no sane way for Linux to run with this attribute clear so a panic() is appropriate. There's small window during boot before the check where kernel has an early #VE handler. But the handler is only for port I/O and will also panic() as soon as it sees any other #VE, such as a one generated by a private memory access. [ dhansen: Rewrite changelog and rebase on new tdx_parse_tdinfo(). Add Kirill's tested-by because I made changes since he wrote this. ] Intel-SIG: commit 373e715e31bf x86/tdx: Panic on bad configs that Fixes: 9a22bf6debbf ("x86/traps: Add #VE support for TDX guest") Reported-by: ruogui.ygr@alibaba-inc.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Tested-by: Kirill A. Shutemov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20221028141220.29217-3-kirill.shutemov%40linux.intel.com [ zhiminghufighting: amend commit log ] Signed-off-by: zhiminghufighting --- arch/x86/coco/tdx/tdx.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 79427fc15675..9c744484af17 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -35,6 +35,8 @@ #define VE_GET_PORT_NUM(e) ((e) >> 16) #define VE_IS_IO_STRING(e) ((e) & BIT(4)) +#define ATTR_SEPT_VE_DISABLE BIT(28) + /* * Wrapper for standard use of __tdx_hypercall with no output aside from * return code. @@ -103,6 +105,7 @@ static void tdx_parse_tdinfo(u64 *cc_mask) { struct tdx_module_output out; unsigned int gpa_width; + u64 td_attr; /* * TDINFO TDX module call is used to get the TD execution environment @@ -110,19 +113,27 @@ static void tdx_parse_tdinfo(u64 *cc_mask) * information, etc. More details about the ABI can be found in TDX * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL * [TDG.VP.INFO]. - * - * The GPA width that comes out of this call is critical. TDX guests - * can not meaningfully run without it. */ tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out); - gpa_width = out.rcx & GENMASK(5, 0); - /* * The highest bit of a guest physical address is the "sharing" bit. * Set it for shared pages and clear it for private pages. + * + * The GPA width that comes out of this call is critical. TDX guests + * can not meaningfully run without it. */ + gpa_width = out.rcx & GENMASK(5, 0); *cc_mask = BIT_ULL(gpa_width - 1); + + /* + * The kernel can not handle #VE's when accessing normal kernel + * memory. Ensure that no #VE will be delivered for accesses to + * TD-private memory. Only VMM-shared memory (MMIO) will #VE. + */ + td_attr = out.rdx; + if (!(td_attr & ATTR_SEPT_VE_DISABLE)) + panic("TD misconfiguration: SEPT_VE_DISABLE attibute must be set.\n"); } /* -- Gitee