diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5db9f40083934201254b3847eaeda154c3356dde..4d2ab2471fd25cf5068f3eedbc7a224676c0f71e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -876,9 +876,11 @@ config INTEL_TDX_GUEST bool "Intel TDX (Trust Domain Extensions) - Guest Support" depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC + depends on EFI_STUB select ARCH_HAS_CC_PLATFORM select X86_MEM_ENCRYPT select X86_MCE + select UNACCEPTED_MEMORY help Support running as a guest under Intel TDX. Without this support, the guest kernel can not boot or run under TDX. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a7229e130f99e5a5b1ac2dab03218fe3672f33a..df87f9106b34061181fa8679e8824b6b049cd266 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -101,9 +101,11 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o +vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o +vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a $(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 8bcbcee54aa13703f82bc7e48c02bfdb48351e76..9caf89063e775eb054a857849eb8dd35a0ed9818 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -3,10 +3,9 @@ #include "misc.h" #include "error.h" #include "../string.h" +#include "efi.h" #include -#include -#include /* * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0' @@ -20,153 +19,56 @@ */ struct mem_vector immovable_mem[MAX_NUMNODES*2]; -/* - * Search EFI system tables for RSDP. 
If both ACPI_20_TABLE_GUID and - * ACPI_TABLE_GUID are found, take the former, which has more features. - */ static acpi_physical_address -__efi_get_rsdp_addr(unsigned long config_tables, unsigned int nr_tables, - bool efi_64) +__efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len) { - acpi_physical_address rsdp_addr = 0; - #ifdef CONFIG_EFI - int i; - - /* Get EFI tables from systab. */ - for (i = 0; i < nr_tables; i++) { - acpi_physical_address table; - efi_guid_t guid; - - if (efi_64) { - efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables + i; - - guid = tbl->guid; - table = tbl->table; - - if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) { - debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n"); - return 0; - } - } else { - efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables + i; - - guid = tbl->guid; - table = tbl->table; - } + unsigned long rsdp_addr; + int ret; - if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) - rsdp_addr = table; - else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) - return table; - } + /* + * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to + * ACPI_TABLE_GUID because it has more features. + */ + rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + ACPI_20_TABLE_GUID); + if (rsdp_addr) + return (acpi_physical_address)rsdp_addr; + + /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */ + rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + ACPI_TABLE_GUID); + if (rsdp_addr) + return (acpi_physical_address)rsdp_addr; + + debug_putstr("Error getting RSDP address.\n"); #endif - return rsdp_addr; -} - -/* EFI/kexec support is 64-bit only. 
*/ -#ifdef CONFIG_X86_64 -static struct efi_setup_data *get_kexec_setup_data_addr(void) -{ - struct setup_data *data; - u64 pa_data; - - pa_data = boot_params->hdr.setup_data; - while (pa_data) { - data = (struct setup_data *)pa_data; - if (data->type == SETUP_EFI) - return (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); - - pa_data = data->next; - } - return NULL; -} - -static acpi_physical_address kexec_get_rsdp_addr(void) -{ - efi_system_table_64_t *systab; - struct efi_setup_data *esd; - struct efi_info *ei; - char *sig; - - esd = (struct efi_setup_data *)get_kexec_setup_data_addr(); - if (!esd) - return 0; - - if (!esd->tables) { - debug_putstr("Wrong kexec SETUP_EFI data.\n"); - return 0; - } - - ei = &boot_params->efi_info; - sig = (char *)&ei->efi_loader_signature; - if (strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { - debug_putstr("Wrong kexec EFI loader signature.\n"); - return 0; - } - - /* Get systab from boot params. */ - systab = (efi_system_table_64_t *) (ei->efi_systab | ((__u64)ei->efi_systab_hi << 32)); - if (!systab) - error("EFI system table not found in kexec boot_params."); - - return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables, true); + return 0; } -#else -static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; } -#endif /* CONFIG_X86_64 */ static acpi_physical_address efi_get_rsdp_addr(void) { #ifdef CONFIG_EFI - unsigned long systab, config_tables; + unsigned long cfg_tbl_pa = 0; + unsigned int cfg_tbl_len; + unsigned long systab_pa; unsigned int nr_tables; - struct efi_info *ei; - bool efi_64; - char *sig; - - ei = &boot_params->efi_info; - sig = (char *)&ei->efi_loader_signature; - - if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { - efi_64 = true; - } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) { - efi_64 = false; - } else { - debug_putstr("Wrong EFI loader signature.\n"); - return 0; - } + enum efi_type et; + int ret; - /* Get systab from boot params. 
*/ -#ifdef CONFIG_X86_64 - systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); -#else - if (ei->efi_systab_hi || ei->efi_memmap_hi) { - debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n"); + et = efi_get_type(boot_params); + if (et == EFI_TYPE_NONE) return 0; - } - systab = ei->efi_systab; -#endif - if (!systab) - error("EFI system table not found."); - /* Handle EFI bitness properly */ - if (efi_64) { - efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab; + systab_pa = efi_get_system_table(boot_params); + if (!systab_pa) + error("EFI support advertised, but unable to locate system table."); - config_tables = stbl->tables; - nr_tables = stbl->nr_tables; - } else { - efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab; + ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len); + if (ret || !cfg_tbl_pa) + error("EFI config table not found."); - config_tables = stbl->tables; - nr_tables = stbl->nr_tables; - } - - if (!config_tables) - error("EFI config tables not found."); - - return __efi_get_rsdp_addr(config_tables, nr_tables, efi_64); + return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len); #else return 0; #endif @@ -256,14 +158,6 @@ acpi_physical_address get_rsdp_addr(void) pa = boot_params->acpi_rsdp_addr; - /* - * Try to get EFI data from setup_data. This can happen when we're a - * kexec'ed kernel and kexec(1) has passed all the required EFI info to - * us. - */ - if (!pa) - pa = kexec_get_rsdp_addr(); - if (!pa) pa = efi_get_rsdp_addr(); diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c new file mode 100644 index 0000000000000000000000000000000000000000..129a21713935f8b3970eb2c40a82f626306ca61d --- /dev/null +++ b/arch/x86/boot/compressed/efi.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for early access to EFI configuration table. 
+ * + * Originally derived from arch/x86/boot/compressed/acpi.c + */ + +#include "misc.h" +#include "efi.h" + +/** + * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment. + * + * @bp: pointer to boot_params + * + * Return: EFI_TYPE_{32,64} for valid EFI environments, EFI_TYPE_NONE otherwise. + */ +enum efi_type efi_get_type(struct boot_params *bp) +{ + struct efi_info *ei; + enum efi_type et; + const char *sig; + + ei = &bp->efi_info; + sig = (char *)&ei->efi_loader_signature; + + if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { + et = EFI_TYPE_64; + } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) { + et = EFI_TYPE_32; + } else { + debug_putstr("No EFI environment detected.\n"); + et = EFI_TYPE_NONE; + } + +#ifndef CONFIG_X86_64 + /* + * Existing callers like acpi.c treat this case as an indicator to + * fall-through to non-EFI, rather than an error, so maintain that + * functionality here as well. + */ + if (ei->efi_systab_hi || ei->efi_memmap_hi) { + debug_putstr("EFI system table is located above 4GB and cannot be accessed.\n"); + et = EFI_TYPE_NONE; + } +#endif + + return et; +} + +/** + * efi_get_system_table - Given a pointer to boot_params, retrieve the physical address + * of the EFI system table. + * + * @bp: pointer to boot_params + * + * Return: EFI system table address on success. On error, return 0. + */ +unsigned long efi_get_system_table(struct boot_params *bp) +{ + unsigned long sys_tbl_pa; + struct efi_info *ei; + enum efi_type et; + + /* Get systab from boot params. */ + ei = &bp->efi_info; +#ifdef CONFIG_X86_64 + sys_tbl_pa = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); +#else + sys_tbl_pa = ei->efi_systab; +#endif + if (!sys_tbl_pa) { + debug_putstr("EFI system table not found."); + return 0; + } + + return sys_tbl_pa; +} + +/* + * EFI config table address changes to virtual address after boot, which may + * not be accessible for the kexec'd kernel. 
To address this, kexec provides + * the initial physical address via a struct setup_data entry, which is + * checked for here, along with some sanity checks. + */ +static struct efi_setup_data *get_kexec_setup_data(struct boot_params *bp, + enum efi_type et) +{ +#ifdef CONFIG_X86_64 + struct efi_setup_data *esd = NULL; + struct setup_data *data; + u64 pa_data; + + pa_data = bp->hdr.setup_data; + while (pa_data) { + data = (struct setup_data *)pa_data; + if (data->type == SETUP_EFI) { + esd = (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); + break; + } + + pa_data = data->next; + } + + /* + * Original ACPI code falls back to attempting normal EFI boot in these + * cases, so maintain existing behavior by indicating non-kexec + * environment to the caller, but print them for debugging. + */ + if (esd && !esd->tables) { + debug_putstr("kexec EFI environment missing valid configuration table.\n"); + return NULL; + } + + return esd; +#endif + return NULL; +} + +/** + * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical + * address of EFI configuration table. + * + * @bp: pointer to boot_params + * @cfg_tbl_pa: location to store physical address of config table + * @cfg_tbl_len: location to store number of config table entries + * + * Return: 0 on success. On error, return params are left unchanged. + */ +int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len) +{ + unsigned long sys_tbl_pa; + enum efi_type et; + int ret; + + if (!cfg_tbl_pa || !cfg_tbl_len) + return -EINVAL; + + sys_tbl_pa = efi_get_system_table(bp); + if (!sys_tbl_pa) + return -EINVAL; + + /* Handle EFI bitness properly */ + et = efi_get_type(bp); + if (et == EFI_TYPE_64) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa; + struct efi_setup_data *esd; + + /* kexec provides an alternative EFI conf table, check for it. */ + esd = get_kexec_setup_data(bp, et); + + *cfg_tbl_pa = esd ? 
esd->tables : stbl->tables; + *cfg_tbl_len = stbl->nr_tables; + } else if (et == EFI_TYPE_32) { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa; + + *cfg_tbl_pa = stbl->tables; + *cfg_tbl_len = stbl->nr_tables; + } else { + return -EINVAL; + } + + return 0; +} + +/* Get vendor table address/guid from EFI config table at the given index */ +static int get_vendor_table(void *cfg_tbl, unsigned int idx, + unsigned long *vendor_tbl_pa, + efi_guid_t *vendor_tbl_guid, + enum efi_type et) +{ + if (et == EFI_TYPE_64) { + efi_config_table_64_t *tbl_entry = (efi_config_table_64_t *)cfg_tbl + idx; + + if (!IS_ENABLED(CONFIG_X86_64) && tbl_entry->table >> 32) { + debug_putstr("Error: EFI config table entry located above 4GB.\n"); + return -EINVAL; + } + + *vendor_tbl_pa = tbl_entry->table; + *vendor_tbl_guid = tbl_entry->guid; + + } else if (et == EFI_TYPE_32) { + efi_config_table_32_t *tbl_entry = (efi_config_table_32_t *)cfg_tbl + idx; + + *vendor_tbl_pa = tbl_entry->table; + *vendor_tbl_guid = tbl_entry->guid; + } else { + return -EINVAL; + } + + return 0; +} + +/** + * efi_find_vendor_table - Given EFI config table, search it for the physical + * address of the vendor table associated with GUID. + * + * @bp: pointer to boot_params + * @cfg_tbl_pa: pointer to EFI configuration table + * @cfg_tbl_len: number of entries in EFI configuration table + * @guid: GUID of vendor table + * + * Return: vendor table address on success. On error, return 0. 
+ */ +unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid) +{ + enum efi_type et; + unsigned int i; + + et = efi_get_type(bp); + if (et == EFI_TYPE_NONE) + return 0; + + for (i = 0; i < cfg_tbl_len; i++) { + unsigned long vendor_tbl_pa; + efi_guid_t vendor_tbl_guid; + int ret; + + ret = get_vendor_table((void *)cfg_tbl_pa, i, + &vendor_tbl_pa, + &vendor_tbl_guid, et); + if (ret) + return 0; + + if (!efi_guidcmp(guid, vendor_tbl_guid)) + return vendor_tbl_pa; + } + + return 0; +} diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h new file mode 100644 index 0000000000000000000000000000000000000000..7a837ee8c600c430e0fdadb9f5497329fd5a812d --- /dev/null +++ b/arch/x86/boot/compressed/efi.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_EFI_H +#define BOOT_COMPRESSED_EFI_H + +#include + +#if defined(_LINUX_EFI_H) || defined(_ASM_X86_EFI_H) +#error Please do not include kernel proper namespace headers +#endif + +typedef guid_t efi_guid_t __aligned(__alignof__(u32)); + +#define EFI_GUID(a, b, c, d...) 
(efi_guid_t){ { \ + (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, d } } + +#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) +#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) +#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) +#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31) + +#define EFI32_LOADER_SIGNATURE "EL32" +#define EFI64_LOADER_SIGNATURE "EL64" + +/* + * Generic EFI table header + */ +typedef struct { + u64 signature; + u32 revision; + u32 headersize; + u32 crc32; + u32 reserved; +} efi_table_hdr_t; + +#define EFI_CONVENTIONAL_MEMORY 7 +#define EFI_UNACCEPTED_MEMORY 15 + +#define EFI_MEMORY_MORE_RELIABLE \ + ((u64)0x0000000000010000ULL) /* higher reliability */ +#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ + +#define EFI_PAGE_SHIFT 12 + +typedef struct { + u32 type; + u32 pad; + u64 phys_addr; + u64 virt_addr; + u64 num_pages; + u64 attribute; +} efi_memory_desc_t; + +#define efi_early_memdesc_ptr(map, desc_size, n) \ + (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size))) + +typedef struct { + efi_guid_t guid; + u64 table; +} efi_config_table_64_t; + +typedef struct { + efi_guid_t guid; + u32 table; +} efi_config_table_32_t; + +typedef struct { + efi_table_hdr_t hdr; + u64 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 __pad1; + u64 con_in_handle; + u64 con_in; + u64 con_out_handle; + u64 con_out; + u64 stderr_handle; + u64 stderr; + u64 runtime; + u64 boottime; + u32 nr_tables; + u32 __pad2; + u64 tables; +} efi_system_table_64_t; + +typedef struct { + efi_table_hdr_t hdr; + u32 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 
/* EFI libstub provides vsnprintf() */
#ifdef CONFIG_EFI_STUB
/*
 * Format a message, strip one trailing newline and halt through error().
 *
 * @fmt: printf-style format string, followed by its arguments
 *
 * Does not return: the formatted text is handed to error().
 */
void panic(const char *fmt, ...)
{
	static char buf[1024];
	va_list args;
	int len;

	va_start(args, fmt);
	len = vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	/*
	 * A C99-style vsnprintf() returns the length the complete string would
	 * have had, which can exceed the buffer when the output is truncated
	 * (and may be negative on failure). Clamp it to what is actually
	 * stored so that buf[len - 1] never indexes outside buf.
	 */
	if (len >= (int)sizeof(buf))
		len = sizeof(buf) - 1;

	if (len > 0 && buf[len - 1] == '\n')
		buf[len - 1] = '\0';

	error(buf);
}
#endif
__noreturn __cold; #endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b92fffbe761fd5d38c86f68770bb71152c62d8c3..d24381afeecf932e60d332ac979e07c7c2ca9ba2 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -22,15 +22,14 @@ #include "misc.h" #include "error.h" #include "../string.h" +#include "efi.h" #include #include #include #include #include -#include #include -#include /* Macros used by the included decompressor code below. */ #define STATIC @@ -676,6 +675,33 @@ static bool process_mem_region(struct mem_vector *region, } #ifdef CONFIG_EFI + +/* + * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are + * guaranteed to be free. + * + * Pick free memory more conservatively than the EFI spec allows: according to + * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus + * available to place the kernel image into, but in practice there's firmware + * where using that memory leads to crashes. Buggy vendor EFI code registers + * for an event that triggers on SetVirtualAddressMap(). The handler assumes + * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which + * is probably true for Windows. + * + * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap(). + */ +static inline bool memory_type_is_free(efi_memory_desc_t *md) +{ + if (md->type == EFI_CONVENTIONAL_MEMORY) + return true; + + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && + md->type == EFI_UNACCEPTED_MEMORY) + return true; + + return false; +} + /* * Returns true if we processed the EFI memmap, which we prefer over the E820 * table if it is available. 
@@ -720,18 +746,7 @@ process_efi_entries(unsigned long minimum, unsigned long image_size) for (i = 0; i < nr_desc; i++) { md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); - /* - * Here we are more conservative in picking free memory than - * the EFI spec allows: - * - * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also - * free memory and thus available to place the kernel image into, - * but in practice there's firmware where using that memory leads - * to crashes. - * - * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. - */ - if (md->type != EFI_CONVENTIONAL_MEMORY) + if (!memory_type_is_free(md)) continue; if (efi_soft_reserve_enabled() && diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c new file mode 100644 index 0000000000000000000000000000000000000000..f04b29f3572f696a752a0d1b69ef5e477fc9cf26 --- /dev/null +++ b/arch/x86/boot/compressed/mem.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "error.h" +#include "misc.h" +#include "tdx.h" +#include + +/* + * accept_memory() and process_unaccepted_memory() are called from the EFI stub, + * which runs before the decompressor and its early_tdx_detect(). + * + * Enumerate TDX directly from the early users. 
+ */ +static bool early_is_tdx_guest(void) +{ + static bool once; + static bool is_tdx; + + if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST)) + return false; + + if (!once) { + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, + &sig[0], &sig[2], &sig[1]); + is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig)); + once = true; + } + + return is_tdx; +} + +void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* Platform-specific memory-acceptance call goes here */ + if (early_is_tdx_guest()) { + if (!tdx_accept_memory(start, end)) + panic("TDX: Failed to accept memory\n"); + } else { + error("Cannot accept memory: unknown platform\n"); + } +} + +bool init_unaccepted_memory(void) +{ + guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; + struct efi_unaccepted_memory *table; + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + enum efi_type et; + int ret; + + et = efi_get_type(boot_params); + if (et == EFI_TYPE_NONE) + return false; + + ret = efi_get_conf_table(boot_params, &cfg_table_pa, &cfg_table_len); + if (ret) { + warn("EFI config table not found."); + return false; + } + + table = (void *)efi_find_vendor_table(boot_params, cfg_table_pa, + cfg_table_len, guid); + if (!table) + return false; + + if (table->version != 1) + error("Unknown version of unaccepted memory table\n"); + + /* + * In many cases unaccepted_table is already set by EFI stub, but it + * has to be initialized again to cover cases when the table is not + * allocated by EFI stub or EFI stub copied the kernel image with + * efi_relocate_kernel() before the variable is set. + * + * It must be initialized before the first usage of accept_memory(). 
+ */ + unaccepted_table = table; + + return true; +} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b9d0876edf408188fb3fdf422e5ba39b80766e13..3ef3a59b4858615db06c766d82aeb5b08967fb2f 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -449,6 +449,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #endif debug_putstr("\nDecompressing Linux... "); + + if (init_unaccepted_memory()) { + debug_putstr("Accepting memory... "); + accept_memory(__pa(output), __pa(output) + needed_size); + } + __decompress(input_data, input_len, NULL, NULL, output, output_len, NULL, error); parse_elf(output); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index f240f73e674cdf35cefee74df243472e312c0d47..f4df7cfdc7ae0f0011faeb93c70bef215e160092 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -16,6 +16,15 @@ /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 +/* + * Boot stub deals with identity mappings, physical and virtual addresses are + * the same, so override these defines. + * + * will not define them if they are already defined. 
+ */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + #include #include #include @@ -33,6 +42,8 @@ #include "../ctype.h" #include "../io.h" +#include "efi.h" + #ifdef CONFIG_X86_64 #define memptr long #else @@ -169,4 +180,57 @@ void boot_stage2_vc(void); unsigned long sev_verify_cbit(unsigned long cr3); +enum efi_type { + EFI_TYPE_64, + EFI_TYPE_32, + EFI_TYPE_NONE, +}; + +#ifdef CONFIG_EFI +/* helpers for early EFI config table access */ +enum efi_type efi_get_type(struct boot_params *bp); +unsigned long efi_get_system_table(struct boot_params *bp); +int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len); +unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid); +#else +static inline enum efi_type efi_get_type(struct boot_params *bp) +{ + return EFI_TYPE_NONE; +} + +static inline unsigned long efi_get_system_table(struct boot_params *bp) +{ + return 0; +} + +static inline int efi_get_conf_table(struct boot_params *bp, + unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len) +{ + return -ENOENT; +} + +static inline unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid) +{ + return 0; +} +#endif /* CONFIG_EFI */ + +#ifdef CONFIG_UNACCEPTED_MEMORY +bool init_unaccepted_memory(void); +#else +static inline bool init_unaccepted_memory(void) { return false; } +#endif + +/* Defined in EFI stub */ +extern struct efi_unaccepted_memory *unaccepted_table; +void accept_memory(phys_addr_t start, phys_addr_t end); + #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 2a78746f5a4c38c65c775d5c46f1f19daec4aa24..bc1a08c10143f66229617d74e1acaa1b53bf165a 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,10 +1,10 
@@ -#include #include #include -#include #include "pgtable.h" #include "../string.h" +#include "efi.h" + #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ diff --git a/arch/x86/boot/compressed/tdx-shared.c b/arch/x86/boot/compressed/tdx-shared.c new file mode 100644 index 0000000000000000000000000000000000000000..5ac43762fe13c17e0c23bdd936f8dc3494e97ee5 --- /dev/null +++ b/arch/x86/boot/compressed/tdx-shared.c @@ -0,0 +1,2 @@ +#include "error.h" +#include "../../coco/tdx/tdx-shared.c" diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile index d2db3e6770e50990b2d2472fd748be134b11261f..29e50aa3b97d6b230ff7ec5d49a3771702b54a05 100644 --- a/arch/x86/coco/tdx/Makefile +++ b/arch/x86/coco/tdx/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += tdx.o tdcall.o attest.o +obj-y += tdx.o tdx-shared.o tdcall.o attest.o diff --git a/arch/x86/coco/tdx/tdx-shared.c b/arch/x86/coco/tdx/tdx-shared.c new file mode 100644 index 0000000000000000000000000000000000000000..ef20ddc37b58aa2f505e0ef36d0fd8c1b28e0947 --- /dev/null +++ b/arch/x86/coco/tdx/tdx-shared.c @@ -0,0 +1,71 @@ +#include +#include + +static unsigned long try_accept_one(phys_addr_t start, unsigned long len, + enum pg_level pg_level) +{ + unsigned long accept_size = page_level_size(pg_level); + u64 tdcall_rcx; + u8 page_size; + + if (!IS_ALIGNED(start, accept_size)) + return 0; + + if (len < accept_size) + return 0; + + /* + * Pass the page physical address to the TDX module to accept the + * pending, private page. + * + * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. 
+ */ + switch (pg_level) { + case PG_LEVEL_4K: + page_size = 0; + break; + case PG_LEVEL_2M: + page_size = 1; + break; + case PG_LEVEL_1G: + page_size = 2; + break; + default: + return 0; + } + + tdcall_rcx = start | page_size; + if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) + return 0; + + return accept_size; +} + +bool tdx_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* + * For shared->private conversion, accept the page using + * TDX_ACCEPT_PAGE TDX module call. + */ + while (start < end) { + unsigned long len = end - start; + unsigned long accept_size; + + /* + * Try larger accepts first. It gives chance to VMM to keep + * 1G/2M Secure EPT entries where possible and speeds up + * process by cutting number of hypercalls (if successful). + */ + + accept_size = try_accept_one(start, len, PG_LEVEL_1G); + if (!accept_size) + accept_size = try_accept_one(start, len, PG_LEVEL_2M); + if (!accept_size) + accept_size = try_accept_one(start, len, PG_LEVEL_4K); + if (!accept_size) + return false; + start += accept_size; + } + + return true; +} diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 78f17d405af5dfd361012f45d4eb107599101512..7b30b3020f50b0af5e4f55c095174360f7c89025 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -19,21 +19,6 @@ #include #include -/* TDX module Call Leaf IDs */ -#define TDX_GET_INFO 1 -#define TDX_GET_VEINFO 3 -#define TDX_GET_REPORT 4 -#define TDX_ACCEPT_PAGE 6 -#define TDX_WR 8 - -/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ -#define TDCS_NOTIFY_ENABLES 0x9100000000000010 - -/* TDX hypercall Leaf IDs */ -#define TDVMCALL_MAP_GPA 0x10001 -#define TDVMCALL_SETUP_NOTIFY_INTR 0x10004 -#define TDVMCALL_REPORT_FATAL_ERROR 0x10003 - /* MMIO direction */ #define EPT_READ 0 #define EPT_WRITE 1 @@ -59,24 +44,6 @@ int tdx_notify_irq = -1; #define TDREPORT_SUBTYPE_0 0 -/* - * Wrapper for standard use of __tdx_hypercall with no output aside from - * return code. 
- */ -static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) -{ - struct tdx_hypercall_args args = { - .r10 = TDX_HYPERCALL_STANDARD, - .r11 = fn, - .r12 = r12, - .r13 = r13, - .r14 = r14, - .r15 = r15, - }; - - return __tdx_hypercall(&args); -} - /* Called from __tdx_hypercall() for unrecoverable failure */ noinstr void __tdx_hypercall_failed(void) { @@ -778,47 +745,6 @@ static bool tdx_cache_flush_required(void) return true; } -static bool try_accept_one(phys_addr_t *start, unsigned long len, - enum pg_level pg_level) -{ - unsigned long accept_size = page_level_size(pg_level); - u64 tdcall_rcx; - u8 page_size; - - if (!IS_ALIGNED(*start, accept_size)) - return false; - - if (len < accept_size) - return false; - - /* - * Pass the page physical address to the TDX module to accept the - * pending, private page. - * - * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. - */ - switch (pg_level) { - case PG_LEVEL_4K: - page_size = 0; - break; - case PG_LEVEL_2M: - page_size = 1; - break; - case PG_LEVEL_1G: - page_size = 2; - break; - default: - return false; - } - - tdcall_rcx = *start | page_size; - if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) - return false; - - *start += accept_size; - return true; -} - /* * Notify the VMM about page mapping conversion. More info about ABI * can be found in TDX Guest-Host-Communication Interface (GHCI), @@ -883,32 +809,9 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) if (!tdx_map_gpa(start, end, enc)) return false; - /* private->shared conversion requires only MapGPA call */ - if (!enc) - return true; - - /* - * For shared->private conversion, accept the page using - * TDX_ACCEPT_PAGE TDX module call. - */ - while (start < end) { - unsigned long len = end - start; - - /* - * Try larger accepts first. It gives chance to VMM to keep - * 1G/2M SEPT entries where possible and speeds up process by - * cutting number of hypercalls (if successful). 
- */ - - if (try_accept_one(&start, len, PG_LEVEL_1G)) - continue; - - if (try_accept_one(&start, len, PG_LEVEL_2M)) - continue; - - if (!try_accept_one(&start, len, PG_LEVEL_4K)) - return false; - } + /* shared->private conversion requires memory to be accepted before use */ + if (enc) + return tdx_accept_memory(start, end); return true; } diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ea690f793e6d1f6b3ccbfb032cdcf091ed03f846..f02c3ed1f4a3cd4814af039c45c8a08d0bb93840 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -29,6 +29,8 @@ extern unsigned long efi_fw_vendor, efi_config_table; #define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF +#define EFI_UNACCEPTED_UNIT_SIZE PMD_SIZE + /* * The EFI services are called through variadic functions in many cases. These * functions are implemented in assembler and support only a fixed number of diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 78c9d32b6dd81a8bc0754ee3f96c569331001374..d8c5d8a3189d9b4f1dcded440cff706f82d29f29 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -12,6 +12,21 @@ #define TDVMCALL_STATUS_RETRY 1 +/* TDX module Call Leaf IDs */ +#define TDX_GET_INFO 1 +#define TDX_GET_VEINFO 3 +#define TDX_GET_REPORT 4 +#define TDX_ACCEPT_PAGE 6 +#define TDX_WR 8 + +/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +#define TDCS_NOTIFY_ENABLES 0x9100000000000010 + +/* TDX hypercall Leaf IDs */ +#define TDVMCALL_MAP_GPA 0x10001 +#define TDVMCALL_REPORT_FATAL_ERROR 0x10003 +#define TDVMCALL_SETUP_NOTIFY_INTR 0x10004 + #ifndef __ASSEMBLY__ /* @@ -39,8 +54,47 @@ struct tdx_hypercall_args { u64 __tdx_hypercall(struct tdx_hypercall_args *args); u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); +/* + * Wrapper for standard use of __tdx_hypercall with no output aside from + * return code. 
+ */ +static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = fn, + .r12 = r12, + .r13 = r13, + .r14 = r14, + .r15 = r15, + }; + + return __tdx_hypercall(&args); +} + + /* Called from __tdx_hypercall() for unrecoverable failure */ void __tdx_hypercall_failed(void); +/* + * Used in __tdx_module_call() to gather the output registers' values of the + * TDCALL instruction when requesting services from the TDX module. This is a + * software only structure and not part of the TDX module/VMM ABI + */ +struct tdx_module_output { + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + u64 r10; + u64 r11; +}; + +/* Used to communicate with the TDX module */ +u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out); + +bool tdx_accept_memory(phys_addr_t start, phys_addr_t end); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 16c2e9226c81acd33312371082224f8e3b310924..b6808e33d8571585005b72d6c2d35ce996b6771c 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -4,6 +4,8 @@ #include #include + +#include #include #include @@ -27,21 +29,6 @@ static inline bool platform_has_tdx(void) { return false; } extern enum tdx_module_status_t tdx_module_status; -/* - * Used to gather the output registers values of the TDCALL and SEAMCALL - * instructions when requesting services from the TDX module. - * - * This is a software only structure and not part of the TDX module/VMM ABI. - */ -struct tdx_module_output { - u64 rcx; - u64 rdx; - u64 r8; - u64 r9; - u64 r10; - u64 r11; -}; - /* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. 
This is a software only structure @@ -62,10 +49,6 @@ struct ve_info { void __init tdx_early_init(void); -/* Used to communicate with the TDX module */ -u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, - struct tdx_module_output *out); - void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); diff --git a/arch/x86/include/asm/unaccepted_memory.h b/arch/x86/include/asm/unaccepted_memory.h new file mode 100644 index 0000000000000000000000000000000000000000..572514e36fdec8adcf64329554d9d0bed7f14b1b --- /dev/null +++ b/arch/x86/include/asm/unaccepted_memory.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_UNACCEPTED_MEMORY_H +#define _ASM_X86_UNACCEPTED_MEMORY_H + +#include +#include + +static inline void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* Platform-specific memory-acceptance call goes here */ + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + if (!tdx_accept_memory(start, end)) + panic("TDX: Failed to accept memory\n"); + } else { + panic("Cannot accept memory: unknown platform\n"); + } +} + +static inline struct efi_unaccepted_memory *efi_get_unaccepted_table(void) +{ + if (efi.unaccepted == EFI_INVALID_TABLE_ADDR) + return NULL; + return __va(efi.unaccepted); +} +#endif diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 294ed4392a0ecd965b6b527ba499d3c1be1d1fce..10562885f5fc6108d4f39a246814b5f21bbbe27f 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -109,14 +109,13 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. 
*/ - addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, - aper_size, aper_size); + addr = memblock_phys_alloc_range(aper_size, aper_size, + GART_MIN_ADDR, GART_MAX_ADDR); if (!addr) { pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); return 0; } - memblock_reserve(addr, aper_size); pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 63d8c6c7d125424748e955ef832a6dc4318fb0a3..a11488beaa7d0335e6a1d4716abb7aa2bc7bfd61 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -137,14 +137,12 @@ __ref void *alloc_low_pages(unsigned int num) unsigned long ret = 0; if (min_pfn_mapped < max_pfn_mapped) { - ret = memblock_find_in_range( + ret = memblock_phys_alloc_range( + PAGE_SIZE * num, PAGE_SIZE, min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE * num , PAGE_SIZE); + max_pfn_mapped << PAGE_SHIFT); } - if (ret) - memblock_reserve(ret, PAGE_SIZE * num); - else if (can_use_brk_pgt) + if (!ret && can_use_brk_pgt) ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); if (!ret) @@ -611,8 +609,17 @@ static void __init memory_map_top_down(unsigned long map_start, unsigned long addr; unsigned long mapped_ram_size = 0; - /* xen has big range in reserved near end of ram, skip it at first.*/ - addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); + /* + * Systems that have many reserved areas near top of the memory, + * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will + * require lots of 4K mappings which may exhaust pgt_buf. + * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure + * there is enough mapped memory that can be allocated from + * memblock. 
+ */ + addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, + map_end); + memblock_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e94da744386f3ad552f763bb587dccb65a82eb56..a1b5c71099e61d50095666e18e16e6b921c5d664 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -376,15 +376,14 @@ static int __init numa_alloc_distance(void) cnt++; size = cnt * cnt * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - size, PAGE_SIZE); + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } - memblock_reserve(phys, size); numa_distance = __va(phys); numa_distance_cnt = cnt; diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 87d77cc52f86e00f153e3fe5db176625b6a12eae..737491b13728c3ba88146d8ba728abbc2cdd92db 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - phys_size, PAGE_SIZE); + phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - memblock_reserve(phys, phys_size); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ccebb29bc0aba9b77f15c6dd42fb3512bd6e3bbe..34299ec11b0fa0009314cd31df6145c7f6a7d476 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -96,6 +96,9 @@ static const unsigned long * const efi_tables[] = { 
#ifdef CONFIG_EFI_COCO_SECRET &efi.coco_secret, #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + &efi.unaccepted, +#endif }; u64 efi_setup; /* efi setup_data physical address */ diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index dc472b7e7ad4582217ad491ea73235fdd32ddd5a..497b66a855dc5e440ea7a3b89575b8a87bee7c10 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -54,7 +54,7 @@ void __init reserve_real_mode(void) WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20); if (!mem) { pr_info("No sub-1M memory is available for the trampoline\n"); return; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index ad12ac173a04e8f941a08cb77fcef49a8e79d66a..19714b23f2cfffc7f2c099684d3da563004eb5bb 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -631,8 +631,8 @@ void __init acpi_table_upgrade(void) } acpi_tables_addr = - memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS, - all_tables_size, PAGE_SIZE); + memblock_phys_alloc_range(all_tables_size, PAGE_SIZE, + 0, ACPI_TABLE_UPGRADE_MAX_PHYS); if (!acpi_tables_addr) { WARN_ON(1); return; @@ -647,7 +647,6 @@ void __init acpi_table_upgrade(void) * Both memblock_reserve and e820__range_add (via arch_reserve_mem_area) * works fine. 
*/ - memblock_reserve(acpi_tables_addr, all_tables_size); arch_reserve_mem_area(acpi_tables_addr, all_tables_size); /* diff --git a/drivers/base/node.c b/drivers/base/node.c index bb0f4286b33f098673b09615d4d3bebbab36874f..f6f8691c39edce615f5acf4ed0c1afaed3901aea 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -441,6 +441,9 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d FileHugePages: %8lu kB\n" "Node %d FilePmdMapped: %8lu kB\n" #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + "Node %d Unaccepted: %8lu kB\n" +#endif #ifdef CONFIG_DUPTEXT "Node %d DupText: %8lu kB\n" #endif @@ -475,6 +478,10 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_FILE_THPS)), nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED)) #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + , + nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED)) +#endif #ifdef CONFIG_DUPTEXT , nid, K(node_page_state(pgdat, NR_DUPTEXT)) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index eb055439e2566016b06604391122e693b081f3c9..f178750ba3400d67dea4714d0447be96daea7cb3 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -272,6 +272,35 @@ config EFI_DISABLE_PCI_DMA options "efi=disable_early_pci_dma" or "efi=no_disable_early_pci_dma" may be used to override this option. +config EFI_COCO_SECRET + bool "EFI Confidential Computing Secret Area Support" + depends on EFI + help + Confidential Computing platforms (such as AMD SEV) allow the + Guest Owner to securely inject secrets during guest VM launch. + The secrets are placed in a designated EFI reserved memory area. + + In order to use the secrets in the kernel, the location of the secret + area (as published in the EFI config table) must be kept. + + If you say Y here, the address of the EFI secret area will be kept + for usage inside the kernel. 
This will allow the + virt/coco/efi_secret module to access the secrets, which in turn + allows userspace programs to access the injected secrets. + +config UNACCEPTED_MEMORY + bool "Tdx unaccepted memory support" + depends on EFI_STUB + help + Some Virtual Machine platforms, such as Intel TDX, require + some memory to be "accepted" by the guest before it can be used. + This mechanism helps prevent malicious hosts from making changes + to guest memory. + + UEFI specification v2.9 introduced EFI_UNACCEPTED_MEMORY memory type. + + This option adds support for unaccepted memory and makes such memory + usable by the kernel. endmenu config EFI_EMBEDDED_FIRMWARE @@ -322,18 +351,3 @@ config YITIAN_CPER_RAWDATA including CMN, GIC, SMMU, DDR, etc. It gathers more useful error info from hardware, which helps to debug and test RAS feature. -config EFI_COCO_SECRET - bool "EFI Confidential Computing Secret Area Support" - depends on EFI - help - Confidential Computing platforms (such as AMD SEV) allow the - Guest Owner to securely inject secrets during guest VM launch. - The secrets are placed in a designated EFI reserved memory area. - - In order to use the secrets in the kernel, the location of the secret - area (as published in the EFI config table) must be kept. - - If you say Y here, the address of the EFI secret area will be kept - for usage inside the kernel. This will allow the - virt/coco/efi_secret module to access the secrets, which in turn - allows userspace programs to access the injected secrets. 
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index eebc886b461553b206d2d82c22aff3736f917215..d258d808b8243d29f016ce5fe3bd239b0f6d0abd 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -45,3 +45,4 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o obj-$(CONFIG_EFI_EARLYCON) += earlycon.o obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o obj-$(CONFIG_UEFI_CPER_X86) += cper-x86.o +obj-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 30a5ab917f4049980ceb2c76c93f110528990efb..427a9213872e5b1b942266484bc8ff17e9bcff2e 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -49,6 +49,9 @@ struct efi __read_mostly efi = { #ifdef CONFIG_EFI_COCO_SECRET .coco_secret = EFI_INVALID_TABLE_ADDR, #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + .unaccepted = EFI_INVALID_TABLE_ADDR, +#endif }; EXPORT_SYMBOL(efi); @@ -536,6 +539,9 @@ static const efi_config_table_type_t common_tables[] __initconst = { #endif #ifdef CONFIG_EFI_COCO_SECRET {LINUX_EFI_COCO_SECRET_AREA_GUID, &efi.coco_secret, "CocoSecret" }, +#endif +#ifdef CONFIG_UNACCEPTED_MEMORY + {LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID, &efi.unaccepted, "Unaccepted" }, #endif {}, }; @@ -559,6 +565,34 @@ static __init int match_config_table(const efi_guid_t *guid, return 0; } +/** + * reserve_unaccepted - Map and reserve unaccepted configuration table + * @unaccepted: Pointer to unaccepted memory table + * + * memblock_add() makes sure that the table is mapped in direct mapping. During + * normal boot it happens automatically because the table is allocated from + * usable memory. But during crashkernel boot only memory specifically reserved + * for crash scenario is mapped. memblock_add() forces the table to be mapped + * in crashkernel case. + * + * Align the range to the nearest page borders. Ranges smaller than page size + * are not going to be mapped. 
+ * + * memblock_reserve() makes sure that future allocations will not touch the + * table. + */ + +static __init void reserve_unaccepted(struct efi_unaccepted_memory *unaccepted) +{ + phys_addr_t start, size; + + start = PAGE_ALIGN_DOWN(efi.unaccepted); + size = PAGE_ALIGN(sizeof(*unaccepted) + unaccepted->size); + + memblock_add(start, size); + memblock_reserve(start, size); +} + int __init efi_config_parse_tables(const efi_config_table_t *config_tables, int count, const efi_config_table_type_t *arch_tables) @@ -669,6 +703,23 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables, } } + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && + efi.unaccepted != EFI_INVALID_TABLE_ADDR) { + struct efi_unaccepted_memory *unaccepted; + + unaccepted = early_memremap(efi.unaccepted, sizeof(*unaccepted)); + if (unaccepted) { + + if (unaccepted->version == 1) { + reserve_unaccepted(unaccepted); + } else { + efi.unaccepted = EFI_INVALID_TABLE_ADDR; + } + + early_memunmap(unaccepted, sizeof(*unaccepted)); + } + } + return 0; } @@ -755,6 +806,7 @@ static __initdata char memory_type_name[][13] = { "MMIO Port", "PAL Code", "Persistent", + "Unaccepted", }; char * __init efi_md_typeattr_format(char *buf, size_t size, diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index fc5c4f0b14a53af3978e3510ffc9635e7e8f0f57..920c737adb9c6159efd69006fe118021977f58ce 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -77,6 +77,8 @@ CFLAGS_arm32-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) zboot-obj-$(CONFIG_RISCV) := lib-clz_ctz.o lib-ashldi3.o lib-$(CONFIG_EFI_ZBOOT) += zboot.o $(zboot-obj-y) +lib-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o bitmap.o find.o + extra-y := $(lib-y) lib-y := $(patsubst %.o,%.stub.o,$(lib-y)) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 
415a971e76947c9c5ccb58618e50f9f42c314616..ac9bb3fb499c99c9f6f8370f52defbde8c9ae881 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -42,26 +42,17 @@ efi_status_t check_platform_features(void) */ static bool check_image_region(u64 base, u64 size) { - unsigned long map_size, desc_size, buff_size; - efi_memory_desc_t *memory_map; - struct efi_boot_memmap map; + struct efi_boot_memmap *map; efi_status_t status; bool ret = false; int map_offset; - map.map = &memory_map; - map.map_size = &map_size; - map.desc_size = &desc_size; - map.desc_ver = NULL; - map.key_ptr = NULL; - map.buff_size = &buff_size; - - status = efi_get_memory_map(&map); + status = efi_get_memory_map(&map, false); if (status != EFI_SUCCESS) return false; - for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { - efi_memory_desc_t *md = (void *)memory_map + map_offset; + for (map_offset = 0; map_offset < map->map_size; map_offset += map->desc_size) { + efi_memory_desc_t *md = (void *)map->map + map_offset; u64 end = md->phys_addr + md->num_pages * EFI_PAGE_SIZE; /* @@ -74,7 +65,7 @@ static bool check_image_region(u64 base, u64 size) } } - efi_bs_call(free_pool, memory_map); + efi_bs_call(free_pool, map); return ret; } diff --git a/drivers/firmware/efi/libstub/bitmap.c b/drivers/firmware/efi/libstub/bitmap.c new file mode 100644 index 0000000000000000000000000000000000000000..5c9bba0d549be20e984d3ed3906aa9f8dd459d62 --- /dev/null +++ b/drivers/firmware/efi/libstub/bitmap.c @@ -0,0 +1,41 @@ +#include + +void __bitmap_set(unsigned long *map, unsigned int start, int len) +{ + unsigned long *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + if (len) { + 
mask_to_set &= BITMAP_LAST_WORD_MASK(size); + *p |= mask_to_set; + } +} + +void __bitmap_clear(unsigned long *map, unsigned int start, int len) +{ + unsigned long *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 2e8d44c03d325e706bc94df73a2c86b557d25a35..78f78f50fef1aa69e0bee333e1eca1addf23f71a 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -419,7 +419,6 @@ char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len) /** * efi_exit_boot_services() - Exit boot services * @handle: handle of the exiting image - * @map: pointer to receive the memory map * @priv: argument to be passed to @priv_func * @priv_func: function to process the memory map before exiting boot services * @@ -432,26 +431,26 @@ char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len) * * Return: status code */ -efi_status_t efi_exit_boot_services(void *handle, - struct efi_boot_memmap *map, - void *priv, +efi_status_t efi_exit_boot_services(void *handle, void *priv, efi_exit_boot_map_processing priv_func) { + struct efi_boot_memmap *map; efi_status_t status; - status = efi_get_memory_map(map); - + status = efi_get_memory_map(&map, true); if (status != EFI_SUCCESS) - goto fail; + return status; status = priv_func(map, priv); - if (status != EFI_SUCCESS) - goto free_map; + if (status != EFI_SUCCESS) { + efi_bs_call(free_pool, map); + return status; + } if (efi_disable_pci_dma) 
efi_pci_disable_bridge_busmaster(); - status = efi_bs_call(exit_boot_services, handle, *map->key_ptr); + status = efi_bs_call(exit_boot_services, handle, map->map_key); if (status == EFI_INVALID_PARAMETER) { /* @@ -467,35 +466,26 @@ efi_status_t efi_exit_boot_services(void *handle, * buffer should account for any changes in the map so the call * to get_memory_map() is expected to succeed here. */ - *map->map_size = *map->buff_size; + map->map_size = map->buff_size; status = efi_bs_call(get_memory_map, - map->map_size, - *map->map, - map->key_ptr, - map->desc_size, - map->desc_ver); + &map->map_size, + &map->map, + &map->map_key, + &map->desc_size, + &map->desc_ver); /* exit_boot_services() was called, thus cannot free */ if (status != EFI_SUCCESS) - goto fail; + return status; status = priv_func(map, priv); /* exit_boot_services() was called, thus cannot free */ if (status != EFI_SUCCESS) - goto fail; + return status; - status = efi_bs_call(exit_boot_services, handle, *map->key_ptr); + status = efi_bs_call(exit_boot_services, handle, map->map_key); } - /* exit_boot_services() was called, thus cannot free */ - if (status != EFI_SUCCESS) - goto fail; - - return EFI_SUCCESS; - -free_map: - efi_bs_call(free_pool, *map->map); -fail: return status; } diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 8ad849cb30f7345a5f8ee1d4877bf2e20475abe0..da53d75d133e9b7a081f35f88d56a968210812e7 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -328,6 +328,35 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, return status; } +/* + * efi_alloc_virtmap() - create a pool allocation for the virtmap + * + * Create an allocation that is of sufficient size to hold all the memory + * descriptors that will be passed to SetVirtualAddressMap() to inform the + * firmware about the virtual mapping that will be used under the OS to call + * into the firmware.
+ */ +efi_status_t efi_alloc_virtmap(efi_memory_desc_t **virtmap, + unsigned long *desc_size, u32 *desc_ver) +{ + unsigned long size, mmap_key; + efi_status_t status; + + /* + * Use the size of the current memory map as an upper bound for the + * size of the buffer we need to pass to SetVirtualAddressMap() to + * cover all EFI_MEMORY_RUNTIME regions. + */ + size = 0; + status = efi_bs_call(get_memory_map, &size, NULL, &mmap_key, desc_size, + desc_ver); + if (status != EFI_BUFFER_TOO_SMALL) + return EFI_LOAD_ERROR; + + return efi_bs_call(allocate_pool, EFI_LOADER_DATA, size, + (void **)virtmap); +} + /* * efi_get_virtmap() - create a virtual mapping for the EFI memory map * @@ -343,6 +372,8 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, efi_memory_desc_t *in, *out = runtime_map; int l; + *count = 0; + for (l = 0; l < map_size; l += desc_size) { u64 paddr, size; diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index fc48e8e4c4906ee91cece96d1a913b90f049bd47..3afad09e5be992769ae4ea15926426cb546ba422 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -157,15 +157,6 @@ void efi_set_u64_split(u64 data, u32 *lo, u32 *hi) */ #define EFI_MMAP_NR_SLACK_SLOTS 8 -struct efi_boot_memmap { - efi_memory_desc_t **map; - unsigned long *map_size; - unsigned long *desc_size; - u32 *desc_ver; - unsigned long *key_ptr; - unsigned long *buff_size; -}; - typedef struct efi_generic_dev_path efi_device_path_protocol_t; union efi_device_path_to_text_protocol { @@ -764,9 +755,7 @@ typedef efi_status_t (*efi_exit_boot_map_processing)( struct efi_boot_memmap *map, void *priv); -efi_status_t efi_exit_boot_services(void *handle, - struct efi_boot_memmap *map, - void *priv, +efi_status_t efi_exit_boot_services(void *handle, void *priv, efi_exit_boot_map_processing priv_func); efi_status_t allocate_new_fdt_and_exit_boot(void *handle, @@ -779,6 +768,8 @@ efi_status_t 
allocate_new_fdt_and_exit_boot(void *handle, void *get_fdt(unsigned long *fdt_size); +efi_status_t efi_alloc_virtmap(efi_memory_desc_t **virtmap, + unsigned long *desc_size, u32 *desc_ver); void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, unsigned long desc_size, efi_memory_desc_t *runtime_map, int *count); @@ -804,7 +795,8 @@ void efi_apply_loadoptions_quirk(const void **load_options, int *load_options_si char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len); -efi_status_t efi_get_memory_map(struct efi_boot_memmap *map); +efi_status_t efi_get_memory_map(struct efi_boot_memmap **map, + bool install_cfg_tbl); efi_status_t efi_allocate_pages(unsigned long size, unsigned long *addr, unsigned long max); @@ -869,4 +861,10 @@ asmlinkage void __noreturn efi_enter_kernel(unsigned long entrypoint, void efi_handle_post_ebs_state(void); +efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, + struct efi_boot_memmap *map); +void process_unaccepted_memory(u64 start, u64 end); +void accept_memory(phys_addr_t start, phys_addr_t end); +void arch_accept_memory(phys_addr_t start, phys_addr_t end); + #endif diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index 368cd60000eec18ec3affe2a199a64631ac81fe4..b2ca6846e0798b7d36c26276ee93c5c74ce38ee1 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -170,25 +170,25 @@ static efi_status_t update_fdt_memmap(void *fdt, struct efi_boot_memmap *map) if (node < 0) return EFI_LOAD_ERROR; - fdt_val64 = cpu_to_fdt64((unsigned long)*map->map); + fdt_val64 = cpu_to_fdt64((unsigned long)map->map); err = fdt_setprop_inplace_var(fdt, node, "linux,uefi-mmap-start", fdt_val64); if (err) return EFI_LOAD_ERROR; - fdt_val32 = cpu_to_fdt32(*map->map_size); + fdt_val32 = cpu_to_fdt32(map->map_size); err = fdt_setprop_inplace_var(fdt, node, "linux,uefi-mmap-size", fdt_val32); if (err) return EFI_LOAD_ERROR; - fdt_val32 = 
cpu_to_fdt32(*map->desc_size); + fdt_val32 = cpu_to_fdt32(map->desc_size); err = fdt_setprop_inplace_var(fdt, node, "linux,uefi-mmap-desc-size", fdt_val32); if (err) return EFI_LOAD_ERROR; - fdt_val32 = cpu_to_fdt32(*map->desc_ver); + fdt_val32 = cpu_to_fdt32(map->desc_ver); err = fdt_setprop_inplace_var(fdt, node, "linux,uefi-mmap-desc-ver", fdt_val32); if (err) @@ -198,22 +198,25 @@ static efi_status_t update_fdt_memmap(void *fdt, struct efi_boot_memmap *map) } struct exit_boot_struct { + struct efi_boot_memmap *boot_memmap; efi_memory_desc_t *runtime_map; - int *runtime_entry_count; + int runtime_entry_count; void *new_fdt_addr; }; -static efi_status_t exit_boot_func(struct efi_boot_memmap *map, - void *priv) +static efi_status_t exit_boot_func(struct efi_boot_memmap *map, void *priv) { struct exit_boot_struct *p = priv; + + p->boot_memmap = map; + /* * Update the memory map with virtual addresses. The function will also * populate @runtime_map with copies of just the EFI_MEMORY_RUNTIME * entries so that we can pass it straight to SetVirtualAddressMap() */ - efi_get_virtmap(*map->map, *map->map_size, *map->desc_size, - p->runtime_map, p->runtime_entry_count); + efi_get_virtmap(map->map, map->map_size, map->desc_size, + p->runtime_map, &p->runtime_entry_count); return update_fdt_memmap(p->new_fdt_addr, map); } @@ -244,51 +247,28 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, unsigned long fdt_addr, unsigned long fdt_size) { - unsigned long map_size, desc_size, buff_size; + unsigned long desc_size; u32 desc_ver; - unsigned long mmap_key; - efi_memory_desc_t *memory_map, *runtime_map; efi_status_t status; - int runtime_entry_count; - struct efi_boot_memmap map; struct exit_boot_struct priv; - map.map = &runtime_map; - map.map_size = &map_size; - map.desc_size = &desc_size; - map.desc_ver = &desc_ver; - map.key_ptr = &mmap_key; - map.buff_size = &buff_size; - - /* - * Get a copy of the current memory map that we will use to prepare - * the input for 
SetVirtualAddressMap(). We don't have to worry about - * subsequent allocations adding entries, since they could not affect - * the number of EFI_MEMORY_RUNTIME regions. - */ - status = efi_get_memory_map(&map); - if (status != EFI_SUCCESS) { - efi_err("Unable to retrieve UEFI memory map.\n"); - return status; + if (!efi_novamap) { + status = efi_alloc_virtmap(&priv.runtime_map, &desc_size, + &desc_ver); + if (status != EFI_SUCCESS) { + efi_err("Unable to retrieve UEFI memory map.\n"); + return status; + } } efi_info("Exiting boot services and installing virtual address map...\n"); - map.map = &memory_map; status = efi_allocate_pages(MAX_FDT_SIZE, new_fdt_addr, max_addr); if (status != EFI_SUCCESS) { efi_err("Unable to allocate memory for new device tree.\n"); goto fail; } - /* - * Now that we have done our final memory allocation (and free) - * we can get the memory map key needed for exit_boot_services(). - */ - status = efi_get_memory_map(&map); - if (status != EFI_SUCCESS) - goto fail_free_new_fdt; - status = update_fdt((void *)fdt_addr, fdt_size, (void *)*new_fdt_addr, MAX_FDT_SIZE, cmdline_ptr, initrd_addr, initrd_size); @@ -298,12 +278,9 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, goto fail_free_new_fdt; } - runtime_entry_count = 0; - priv.runtime_map = runtime_map; - priv.runtime_entry_count = &runtime_entry_count; - priv.new_fdt_addr = (void *)*new_fdt_addr; + priv.new_fdt_addr = (void *)*new_fdt_addr; - status = efi_exit_boot_services(handle, &map, &priv, exit_boot_func); + status = efi_exit_boot_services(handle, &priv, exit_boot_func); if (status == EFI_SUCCESS) { efi_set_virtual_address_map_t *svam; @@ -313,8 +290,8 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, /* Install the new virtual address map */ svam = efi_system_table->runtime->set_virtual_address_map; - status = svam(runtime_entry_count * desc_size, desc_size, - desc_ver, runtime_map); + status = svam(priv.runtime_entry_count * desc_size, desc_size, + desc_ver, 
priv.runtime_map); /* * We are beyond the point of no return here, so if the call to @@ -322,6 +299,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, * incoming kernel but proceed normally otherwise. */ if (status != EFI_SUCCESS) { + efi_memory_desc_t *p; int l; /* @@ -330,8 +308,9 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, * the incoming kernel that no virtual translation has * been installed. */ - for (l = 0; l < map_size; l += desc_size) { - efi_memory_desc_t *p = (void *)memory_map + l; + for (l = 0; l < priv.boot_memmap->map_size; + l += priv.boot_memmap->desc_size) { + p = (void *)priv.boot_memmap->map + l; if (p->attribute & EFI_MEMORY_RUNTIME) p->virt_addr = 0; @@ -346,7 +325,8 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, efi_free(MAX_FDT_SIZE, *new_fdt_addr); fail: - efi_system_table->boottime->free_pool(runtime_map); + if (!efi_novamap) + efi_bs_call(free_pool, priv.runtime_map); return EFI_LOAD_ERROR; } diff --git a/drivers/firmware/efi/libstub/find.c b/drivers/firmware/efi/libstub/find.c new file mode 100644 index 0000000000000000000000000000000000000000..9405825f40aafa0c60faeb985dae9384a5fc3bfa --- /dev/null +++ b/drivers/firmware/efi/libstub/find.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +/* + * Common helper for find_next_bit() function family + * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) + * @MUNGE: The expression that post-processes a word containing found bit (may be empty) + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at + */ +#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \ +({ \ + unsigned long mask, idx, tmp, sz = (size), __start = (start); \ + \ + if (unlikely(__start >= sz)) \ + goto out; \ + \ + mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \ + idx = __start / BITS_PER_LONG; \ + \ + for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \ + if ((idx + 1) * BITS_PER_LONG >= sz) \ + goto 
out; \ + idx++; \ + } \ + \ + sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \ +out: \ + sz; \ +}) + +unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) +{ + return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start); +} + +unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, + unsigned long start) +{ + return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); +} + +#ifndef find_next_bit +/* + * Find the next set bit in a memory region. + */ +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, size, offset); +} +#endif + +#ifndef find_next_zero_bit +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_zero_bit(addr, size, offset); +} +#endif diff --git a/drivers/firmware/efi/libstub/mem.c b/drivers/firmware/efi/libstub/mem.c index feef8d4be113b653d8216347ed81e7beaf55396e..45841ef55a9f6ba284ec7f4bd34fc24a6b91451e 100644 --- a/drivers/firmware/efi/libstub/mem.c +++ b/drivers/firmware/efi/libstub/mem.c @@ -5,71 +5,66 @@ #include "efistub.h" -static inline bool mmap_has_headroom(unsigned long buff_size, - unsigned long map_size, - unsigned long desc_size) -{ - unsigned long slack = buff_size - map_size; - - return slack / desc_size >= EFI_MMAP_NR_SLACK_SLOTS; -} - /** * efi_get_memory_map() - get memory map - * @map: on return pointer to memory map + * @map: pointer to memory map pointer to which to assign the + * newly allocated memory map + * @install_cfg_tbl: whether or not to install the boot memory map as a + * configuration table * * Retrieve the UEFI memory map. The allocated memory leaves room for * up to EFI_MMAP_NR_SLACK_SLOTS additional memory map entries.
* * Return: status code */ -efi_status_t efi_get_memory_map(struct efi_boot_memmap *map) +efi_status_t efi_get_memory_map(struct efi_boot_memmap **map, + bool install_cfg_tbl) { - efi_memory_desc_t *m = NULL; + int memtype = install_cfg_tbl ? EFI_ACPI_RECLAIM_MEMORY + : EFI_LOADER_DATA; + efi_guid_t tbl_guid = LINUX_EFI_BOOT_MEMMAP_GUID; + struct efi_boot_memmap *m, tmp; efi_status_t status; - unsigned long key; - u32 desc_version; - - *map->desc_size = sizeof(*m); - *map->map_size = *map->desc_size * 32; - *map->buff_size = *map->map_size; -again: - status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, - *map->map_size, (void **)&m); + unsigned long size; + + tmp.map_size = 0; + status = efi_bs_call(get_memory_map, &tmp.map_size, NULL, &tmp.map_key, + &tmp.desc_size, &tmp.desc_ver); + if (status != EFI_BUFFER_TOO_SMALL) + return EFI_LOAD_ERROR; + + size = tmp.map_size + tmp.desc_size * EFI_MMAP_NR_SLACK_SLOTS; + status = efi_bs_call(allocate_pool, memtype, sizeof(*m) + size, + (void **)&m); if (status != EFI_SUCCESS) - goto fail; - - *map->desc_size = 0; - key = 0; - status = efi_bs_call(get_memory_map, map->map_size, m, - &key, map->desc_size, &desc_version); - if (status == EFI_BUFFER_TOO_SMALL || - !mmap_has_headroom(*map->buff_size, *map->map_size, - *map->desc_size)) { - efi_bs_call(free_pool, m); + return status; + + if (install_cfg_tbl) { /* - * Make sure there is some entries of headroom so that the - * buffer can be reused for a new map after allocations are - * no longer permitted. Its unlikely that the map will grow to - * exceed this headroom once we are ready to trigger - * ExitBootServices() + * Installing a configuration table might allocate memory, and + * this may modify the memory map. This means we should install + * the configuration table first, and re-install or delete it + * as needed. 
*/ - *map->map_size += *map->desc_size * EFI_MMAP_NR_SLACK_SLOTS; - *map->buff_size = *map->map_size; - goto again; + status = efi_bs_call(install_configuration_table, &tbl_guid, m); + if (status != EFI_SUCCESS) + goto free_map; } - if (status == EFI_SUCCESS) { - if (map->key_ptr) - *map->key_ptr = key; - if (map->desc_ver) - *map->desc_ver = desc_version; - } else { - efi_bs_call(free_pool, m); - } + m->buff_size = m->map_size = size; + status = efi_bs_call(get_memory_map, &m->map_size, m->map, &m->map_key, + &m->desc_size, &m->desc_ver); + if (status != EFI_SUCCESS) + goto uninstall_table; + + *map = m; + return EFI_SUCCESS; -fail: - *map->map = m; +uninstall_table: + if (install_cfg_tbl) + efi_bs_call(install_configuration_table, &tbl_guid, NULL); +free_map: + efi_bs_call(free_pool, m); return status; } diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 724155b9e10dcf84a44a836efed036574daaa728..0d3b6607550840b977ca66060e2f860587288b6e 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -55,21 +55,13 @@ efi_status_t efi_random_alloc(unsigned long size, unsigned long *addr, unsigned long random_seed) { - unsigned long map_size, desc_size, total_slots = 0, target_slot; - unsigned long buff_size; + unsigned long total_slots = 0, target_slot; + unsigned long total_mirrored_slots = 0; + struct efi_boot_memmap *map; efi_status_t status; - efi_memory_desc_t *memory_map; int map_offset; - struct efi_boot_memmap map; - map.map = &memory_map; - map.map_size = &map_size; - map.desc_size = &desc_size; - map.desc_ver = NULL; - map.key_ptr = NULL; - map.buff_size = &buff_size; - - status = efi_get_memory_map(&map); + status = efi_get_memory_map(&map, false); if (status != EFI_SUCCESS) return status; @@ -79,8 +71,8 @@ efi_status_t efi_random_alloc(unsigned long size, size = round_up(size, EFI_ALLOC_ALIGN); /* count the suitable slots in each memory map entry */ - for 
(map_offset = 0; map_offset < map_size; map_offset += desc_size) { - efi_memory_desc_t *md = (void *)memory_map + map_offset; + for (map_offset = 0; map_offset < map->map_size; map_offset += map->desc_size) { + efi_memory_desc_t *md = (void *)map->map + map_offset; unsigned long slots; slots = get_entry_num_slots(md, size, ilog2(align)); @@ -102,8 +94,8 @@ efi_status_t efi_random_alloc(unsigned long size, * to calculate the randomly chosen address, and allocate it directly * using EFI_ALLOCATE_ADDRESS. */ - for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { - efi_memory_desc_t *md = (void *)memory_map + map_offset; + for (map_offset = 0; map_offset < map->map_size; map_offset += map->desc_size) { + efi_memory_desc_t *md = (void *)map->map + map_offset; efi_physical_addr_t target; unsigned long pages; @@ -122,7 +114,7 @@ efi_status_t efi_random_alloc(unsigned long size, break; } - efi_bs_call(free_pool, memory_map); + efi_bs_call(free_pool, map); return status; } diff --git a/drivers/firmware/efi/libstub/relocate.c b/drivers/firmware/efi/libstub/relocate.c index 8ee9eb2b90392d993431bba9972923cc4c293683..bf6fbd5d22a1a53af196be2a15f5593468f78da4 100644 --- a/drivers/firmware/efi/libstub/relocate.c +++ b/drivers/firmware/efi/libstub/relocate.c @@ -23,21 +23,12 @@ efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, unsigned long *addr, unsigned long min) { - unsigned long map_size, desc_size, buff_size; - efi_memory_desc_t *map; + struct efi_boot_memmap *map; efi_status_t status; unsigned long nr_pages; int i; - struct efi_boot_memmap boot_map; - boot_map.map = &map; - boot_map.map_size = &map_size; - boot_map.desc_size = &desc_size; - boot_map.desc_ver = NULL; - boot_map.key_ptr = NULL; - boot_map.buff_size = &buff_size; - - status = efi_get_memory_map(&boot_map); + status = efi_get_memory_map(&map, false); if (status != EFI_SUCCESS) goto fail; @@ -52,12 +43,12 @@ efi_status_t efi_low_alloc_above(unsigned long size, unsigned long
align, size = round_up(size, EFI_ALLOC_ALIGN); nr_pages = size / EFI_PAGE_SIZE; - for (i = 0; i < map_size / desc_size; i++) { + for (i = 0; i < map->map_size / map->desc_size; i++) { efi_memory_desc_t *desc; - unsigned long m = (unsigned long)map; + unsigned long m = (unsigned long)map->map; u64 start, end; - desc = efi_early_memdesc_ptr(m, desc_size, i); + desc = efi_early_memdesc_ptr(m, map->desc_size, i); if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; @@ -87,7 +78,7 @@ efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, } } - if (i == map_size / desc_size) + if (i == map->map_size / map->desc_size) status = EFI_NOT_FOUND; efi_bs_call(free_pool, map); diff --git a/drivers/firmware/efi/libstub/unaccepted_memory.c b/drivers/firmware/efi/libstub/unaccepted_memory.c new file mode 100644 index 0000000000000000000000000000000000000000..1c98d9e0f4eac9651d188d9c093f1f1a955f09bb --- /dev/null +++ b/drivers/firmware/efi/libstub/unaccepted_memory.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include "efistub.h" + +struct efi_unaccepted_memory *unaccepted_table; + +extern +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset); +extern +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset); +#ifndef for_each_set_bitrange_from +/** + * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange_from(b, e, addr, size) \ + for (; \ + (b) = find_next_bit((addr), (size), (b)), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1), \ + (b) < (size); \ + (b) = (e) + 1) +#endif + +efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, + struct 
efi_boot_memmap *map) +{ + efi_guid_t unaccepted_table_guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; + u64 unaccepted_start = ULLONG_MAX, unaccepted_end = 0, bitmap_size; + efi_status_t status; + int i; + + /* Check if the table is already installed */ + unaccepted_table = get_efi_config_table(unaccepted_table_guid); + if (unaccepted_table) { + if (unaccepted_table->version != 1) { + efi_err("Unknown version of unaccepted memory table\n"); + return EFI_UNSUPPORTED; + } + return EFI_SUCCESS; + } + + /* Check if there's any unaccepted memory and find the max address */ + for (i = 0; i < nr_desc; i++) { + efi_memory_desc_t *d; + unsigned long m = (unsigned long)map->map; + + d = efi_early_memdesc_ptr(m, map->desc_size, i); + if (d->type != EFI_UNACCEPTED_MEMORY) + continue; + + unaccepted_start = min(unaccepted_start, d->phys_addr); + unaccepted_end = max(unaccepted_end, + d->phys_addr + d->num_pages * PAGE_SIZE); + } + + if (unaccepted_start == ULLONG_MAX) + return EFI_SUCCESS; + + unaccepted_start = round_down(unaccepted_start, + EFI_UNACCEPTED_UNIT_SIZE); + unaccepted_end = round_up(unaccepted_end, EFI_UNACCEPTED_UNIT_SIZE); + + /* + * If unaccepted memory is present, allocate a bitmap to track what + * memory has to be accepted before access. + * + * One bit in the bitmap represents 2MiB in the address space: + * A 4k bitmap can track 64GiB of physical address space. + * + * In the worst case scenario -- a huge hole in the middle of the + * address space -- It needs 256MiB to handle 4PiB of the address + * space. + * + * The bitmap will be populated in setup_e820() according to the memory + * map after efi_exit_boot_services(). 
+ */ + bitmap_size = DIV_ROUND_UP(unaccepted_end - unaccepted_start, + EFI_UNACCEPTED_UNIT_SIZE * BITS_PER_BYTE); + + status = efi_bs_call(allocate_pool, EFI_ACPI_RECLAIM_MEMORY, + sizeof(*unaccepted_table) + bitmap_size, + (void **)&unaccepted_table); + if (status != EFI_SUCCESS) { + efi_err("Failed to allocate unaccepted memory config table\n"); + return status; + } + + unaccepted_table->version = 1; + unaccepted_table->unit_size = EFI_UNACCEPTED_UNIT_SIZE; + unaccepted_table->phys_base = unaccepted_start; + unaccepted_table->size = bitmap_size; + memset(unaccepted_table->bitmap, 0, bitmap_size); + + status = efi_bs_call(install_configuration_table, + &unaccepted_table_guid, unaccepted_table); + if (status != EFI_SUCCESS) { + efi_bs_call(free_pool, unaccepted_table); + efi_err("Failed to install unaccepted memory config table!\n"); + } + + return status; +} + +/* + * The accepted memory bitmap only works at unit_size granularity. Take + * unaligned start/end addresses and either: + * 1. Accepts the memory immediately and in its entirety + * 2. Accepts unaligned parts, and marks *some* aligned part unaccepted + * + * The function will never reach the bitmap_set() with zero bits to set. + */ +void process_unaccepted_memory(u64 start, u64 end) +{ + u64 unit_size = unaccepted_table->unit_size; + u64 unit_mask = unaccepted_table->unit_size - 1; + u64 bitmap_size = unaccepted_table->size; + + /* + * Ensure that at least one bit will be set in the bitmap by + * immediately accepting all regions under 2*unit_size. This is + * imprecise and may immediately accept some areas that could + * have been represented in the bitmap. But, results in simpler + * code below + * + * Consider case like this (assuming unit_size == 2MB): + * + * | 4k | 2044k | 2048k | + * ^ 0x0 ^ 2MB ^ 4MB + * + * Only the first 4k has been accepted. The 0MB->2MB region can not be + * represented in the bitmap. The 2MB->4MB region can be represented in + * the bitmap. 
But, the 0MB->4MB region is <2*unit_size and will be + * immediately accepted in its entirety. + */ + if (end - start < 2 * unit_size) { + arch_accept_memory(start, end); + return; + } + + /* + * No matter how the start and end are aligned, at least one unaccepted + * unit_size area will remain to be marked in the bitmap. + */ + + /* Immediately accept a <unit_size piece at the start: */ + if (start & unit_mask) { + arch_accept_memory(start, round_up(start, unit_size)); + start = round_up(start, unit_size); + } + + /* Immediately accept a <unit_size piece at the end: */ + if (end & unit_mask) { + arch_accept_memory(round_down(end, unit_size), end); + end = round_down(end, unit_size); + } + + /* + * Accept the part of the range that lies before phys_base and cannot be + * recorded in the bitmap. + */ + if (start < unaccepted_table->phys_base) { + arch_accept_memory(start, + min(unaccepted_table->phys_base, end)); + start = unaccepted_table->phys_base; + } + + /* Nothing to record */ + if (end < unaccepted_table->phys_base) + return; + + /* Translate to offsets from the beginning of the bitmap */ + start -= unaccepted_table->phys_base; + end -= unaccepted_table->phys_base; + + /* Accept memory that doesn't fit into bitmap */ + if (end > bitmap_size * unit_size * BITS_PER_BYTE) { + unsigned long phys_start, phys_end; + + phys_start = bitmap_size * unit_size * BITS_PER_BYTE + + unaccepted_table->phys_base; + phys_end = end + unaccepted_table->phys_base; + + arch_accept_memory(phys_start, phys_end); + end = bitmap_size * unit_size * BITS_PER_BYTE; + } + + /* + * 'start' and 'end' are now both unit_size-aligned. + * Record the range as being unaccepted: + */ + bitmap_set(unaccepted_table->bitmap, + start / unit_size, (end - start) / unit_size); +} + +void accept_memory(phys_addr_t start, phys_addr_t end) +{ + unsigned long range_start, range_end; + unsigned long bitmap_size; + u64 unit_size; + + if (!unaccepted_table) + return; + + unit_size = unaccepted_table->unit_size; + + /* + * Only care for the part of the range that is represented + * in the bitmap.
+ */ + if (start < unaccepted_table->phys_base) + start = unaccepted_table->phys_base; + if (end < unaccepted_table->phys_base) + return; + + /* Translate to offsets from the beginning of the bitmap */ + start -= unaccepted_table->phys_base; + end -= unaccepted_table->phys_base; + + /* Make sure not to overrun the bitmap */ + if (end > unaccepted_table->size * unit_size * BITS_PER_BYTE) + end = unaccepted_table->size * unit_size * BITS_PER_BYTE; + + range_start = start / unit_size; + bitmap_size = DIV_ROUND_UP(end, unit_size); + + for_each_set_bitrange_from(range_start, range_end, + unaccepted_table->bitmap, bitmap_size) { + unsigned long phys_start, phys_end; + + phys_start = range_start * unit_size + unaccepted_table->phys_base; + phys_end = range_end * unit_size + unaccepted_table->phys_base; + + arch_accept_memory(phys_start, phys_end); + bitmap_clear(unaccepted_table->bitmap, + range_start, range_end - range_start); + } +} diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 3672539cb96eb3043d66adb12122d0502e2ba554..75093967e848b877f4974670f1c839d707c6fc31 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -504,6 +504,16 @@ setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_s e820_type = E820_TYPE_PMEM; break; + case EFI_UNACCEPTED_MEMORY: + if (!IS_ENABLED(CONFIG_UNACCEPTED_MEMORY)) { + efi_warn_once( +"The system has unaccepted memory, but kernel does not support it\nConsider enabling CONFIG_UNACCEPTED_MEMORY\n"); + continue; + } + e820_type = E820_TYPE_RAM; + process_unaccepted_memory(d->phys_addr, + d->phys_addr + PAGE_SIZE * d->num_pages); + break; default: continue; } @@ -572,28 +582,27 @@ static efi_status_t allocate_e820(struct boot_params *params, struct setup_data **e820ext, u32 *e820ext_size) { - unsigned long map_size, desc_size, map_key; + struct efi_boot_memmap *map; efi_status_t status; - __u32 nr_desc, desc_version; - - 
/* Only need the size of the mem map and size of each mem descriptor */ - map_size = 0; - status = efi_bs_call(get_memory_map, &map_size, NULL, &map_key, - &desc_size, &desc_version); - if (status != EFI_BUFFER_TOO_SMALL) - return (status != EFI_SUCCESS) ? status : EFI_UNSUPPORTED; + __u32 nr_desc; - nr_desc = map_size / desc_size + EFI_MMAP_NR_SLACK_SLOTS; + status = efi_get_memory_map(&map, false); + if (status != EFI_SUCCESS) + return status; - if (nr_desc > ARRAY_SIZE(params->e820_table)) { - u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table); + nr_desc = map->map_size / map->desc_size; + if (nr_desc > ARRAY_SIZE(params->e820_table) - EFI_MMAP_NR_SLACK_SLOTS) { + u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table) + + EFI_MMAP_NR_SLACK_SLOTS; status = alloc_e820ext(nr_e820ext, e820ext, e820ext_size); - if (status != EFI_SUCCESS) - return status; } - return EFI_SUCCESS; + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && status == EFI_SUCCESS) + status = allocate_unaccepted_bitmap(nr_desc, map); + + efi_bs_call(free_pool, map); + return status; } struct exit_boot_struct { @@ -613,32 +622,22 @@ static efi_status_t exit_boot_func(struct efi_boot_memmap *map, efi_set_u64_split((unsigned long)efi_system_table, &p->efi->efi_systab, &p->efi->efi_systab_hi); - p->efi->efi_memdesc_size = *map->desc_size; - p->efi->efi_memdesc_version = *map->desc_ver; - efi_set_u64_split((unsigned long)*map->map, + p->efi->efi_memdesc_size = map->desc_size; + p->efi->efi_memdesc_version = map->desc_ver; + efi_set_u64_split((unsigned long)map->map, &p->efi->efi_memmap, &p->efi->efi_memmap_hi); - p->efi->efi_memmap_size = *map->map_size; + p->efi->efi_memmap_size = map->map_size; return EFI_SUCCESS; } static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) { - unsigned long map_sz, key, desc_size, buff_size; - efi_memory_desc_t *mem_map; struct setup_data *e820ext = NULL; __u32 e820ext_size = 0; efi_status_t status; - __u32 desc_version; - struct 
efi_boot_memmap map; struct exit_boot_struct priv; - map.map = &mem_map; - map.map_size = &map_sz; - map.desc_size = &desc_size; - map.desc_ver = &desc_version; - map.key_ptr = &key; - map.buff_size = &buff_size; priv.boot_params = boot_params; priv.efi = &boot_params->efi_info; @@ -647,7 +646,7 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) return status; /* Might as well exit boot services now */ - status = efi_exit_boot_services(handle, &map, &priv, exit_boot_func); + status = efi_exit_boot_services(handle, &priv, exit_boot_func); if (status != EFI_SUCCESS) return status; diff --git a/drivers/firmware/efi/unaccepted_memory.c b/drivers/firmware/efi/unaccepted_memory.c new file mode 100644 index 0000000000000000000000000000000000000000..135278ddaf627bb1fd41ba6062a3596e52bd8f72 --- /dev/null +++ b/drivers/firmware/efi/unaccepted_memory.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +/* Protects unaccepted memory bitmap and accepting_list */ +static DEFINE_SPINLOCK(unaccepted_memory_lock); + +struct accept_range { + struct list_head list; + unsigned long start; + unsigned long end; +}; + +static LIST_HEAD(accepting_list); + +/* + * accept_memory() -- Consult bitmap and accept the memory if needed. + * + * Only memory that is explicitly marked as unaccepted in the bitmap requires + * an action. All the remaining memory is implicitly accepted and doesn't need + * acceptance. 
+ * + * No need to accept: + * - anything if the system has no unaccepted table; + * - memory that is below phys_base; + * - memory that is above the memory that addressable by the bitmap; + */ +void accept_memory(phys_addr_t start, phys_addr_t end) +{ + struct efi_unaccepted_memory *unaccepted; + unsigned long range_start, range_end; + struct accept_range range, *entry; + unsigned long flags; + u64 unit_size; + + unaccepted = efi_get_unaccepted_table(); + if (!unaccepted) + return; + + unit_size = unaccepted->unit_size; + + /* + * Only care for the part of the range that is represented + * in the bitmap. + */ + if (start < unaccepted->phys_base) + start = unaccepted->phys_base; + if (end < unaccepted->phys_base) + return; + + /* Translate to offsets from the beginning of the bitmap */ + start -= unaccepted->phys_base; + end -= unaccepted->phys_base; + + /* + * load_unaligned_zeropad() can lead to unwanted loads across page + * boundaries. The unwanted loads are typically harmless. But, they + * might be made to totally unrelated or even unmapped memory. + * load_unaligned_zeropad() relies on exception fixup (#PF, #GP and now + * #VE) to recover from these unwanted loads. + * + * But, this approach does not work for unaccepted memory. For TDX, a + * load from unaccepted memory will not lead to a recoverable exception + * within the guest. The guest will exit to the VMM where the only + * recourse is to terminate the guest. + * + * There are two parts to fix this issue and comprehensively avoid + * access to unaccepted memory. Together these ensure that an extra + * "guard" page is accepted in addition to the memory that needs to be + * used: + * + * 1. Implicitly extend the range_contains_unaccepted_memory(start, end) + * checks up to end+unit_size if 'end' is aligned on a unit_size + * boundary. + * + * 2. Implicitly extend accept_memory(start, end) to end+unit_size if + * 'end' is aligned on a unit_size boundary. 
(immediately following + * this comment) + */ + if (!(end % unit_size)) + end += unit_size; + + /* Make sure not to overrun the bitmap */ + if (end > unaccepted->size * unit_size * BITS_PER_BYTE) + end = unaccepted->size * unit_size * BITS_PER_BYTE; + + range.start = start / unit_size; + range.end = DIV_ROUND_UP(end, unit_size); +retry: + spin_lock_irqsave(&unaccepted_memory_lock, flags); + + /* + * Check if anybody works on accepting the same range of the memory. + * + * The check is done with unit_size granularity. It is crucial to catch + * all accept requests to the same unit_size block, even if they don't + * overlap on physical address level. + */ + list_for_each_entry(entry, &accepting_list, list) { + if (entry->end <= range.start) + continue; + if (entry->start >= range.end) + continue; + + /* + * Somebody else accepting the range. Or at least part of it. + * + * Drop the lock and retry until it is complete. + */ + spin_unlock_irqrestore(&unaccepted_memory_lock, flags); + goto retry; + } + + /* + * Register that the range is about to be accepted. + * Make sure nobody else will accept it. + */ + list_add(&range.list, &accepting_list); + + range_start = range.start; + for_each_set_bitrange_from(range_start, range_end, unaccepted->bitmap, + range.end) { + unsigned long phys_start, phys_end; + unsigned long len = range_end - range_start; + + phys_start = range_start * unit_size + unaccepted->phys_base; + phys_end = range_end * unit_size + unaccepted->phys_base; + + /* + * Keep interrupts disabled until the accept operation is + * complete in order to prevent deadlocks. + * + * Enabling interrupts before calling arch_accept_memory() + * creates an opportunity for an interrupt handler to request + * acceptance for the same memory. The handler will continuously + * spin with interrupts disabled, preventing other task from + * making progress with the acceptance process.
+ */ + spin_unlock(&unaccepted_memory_lock); + + arch_accept_memory(phys_start, phys_end); + + spin_lock(&unaccepted_memory_lock); + bitmap_clear(unaccepted->bitmap, range_start, len); + } + + list_del(&range.list); + spin_unlock_irqrestore(&unaccepted_memory_lock, flags); +} + +bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end) +{ + struct efi_unaccepted_memory *unaccepted; + unsigned long flags; + bool ret = false; + u64 unit_size; + + unaccepted = efi_get_unaccepted_table(); + if (!unaccepted) + return false; + + unit_size = unaccepted->unit_size; + + /* + * Only care for the part of the range that is represented + * in the bitmap. + */ + if (start < unaccepted->phys_base) + start = unaccepted->phys_base; + if (end < unaccepted->phys_base) + return false; + + /* Translate to offsets from the beginning of the bitmap */ + start -= unaccepted->phys_base; + end -= unaccepted->phys_base; + + /* + * Also consider the unaccepted state of the *next* page. See fix #1 in + * the comment on load_unaligned_zeropad() in accept_memory(). + */ + if (!(end % unit_size)) + end += unit_size; + + /* Make sure not to overrun the bitmap */ + if (end > unaccepted->size * unit_size * BITS_PER_BYTE) + end = unaccepted->size * unit_size * BITS_PER_BYTE; + + spin_lock_irqsave(&unaccepted_memory_lock, flags); + while (start < end) { + if (test_bit(start / unit_size, unaccepted->bitmap)) { + ret = true; + break; + } + + start += unit_size; + } + spin_unlock_irqrestore(&unaccepted_memory_lock, flags); + + return ret; +} diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 6c95bbdf9265a906b71b11622c7dbf6c41ad6504..58cc6134b6feff5eb85a541009cf73e719ef94e1 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -31,18 +31,22 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, phys_addr_t *res_base) { phys_addr_t base; + int err = 0; end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end; align = !align ? 
SMP_CACHE_BYTES : align; - base = memblock_find_in_range(start, end, size, align); + base = memblock_phys_alloc_range(size, align, start, end); if (!base) return -ENOMEM; *res_base = base; - if (nomap) - return memblock_remove(base, size); + if (nomap) { + err = memblock_mark_nomap(base, size); + if (err) + memblock_free(base, size); + } - return memblock_reserve(base, size); + return err; } /** diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 0dcda1b4bbaee1924d2723925ad28aff8616d2eb..1077cf68d8323009d62644a560c8f6d199c0c803 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -188,6 +188,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_ZEROED_PAGES)); #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + show_val_kb(m, "Unaccepted: ", + global_zone_page_state(NR_UNACCEPTED)); +#endif + hugetlb_report_meminfo(m); arch_report_meminfo(m); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 5b74bdf159d6f202adfaaeb9aca60aa764319600..c281a9067170404acf1a8a8972849773ed809049 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -50,6 +50,20 @@ extern unsigned long __sw_hweight64(__u64 w); (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) +/** + * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange_from(b, e, addr, size) \ + for (; \ + (b) = find_next_bit((addr), (size), (b)), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1), \ + (b) < (size); \ + (b) = (e) + 1) + /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset diff --git a/include/linux/efi.h b/include/linux/efi.h index 
b54686eb88d2820e6c8754a40512a2604073eb81..6f11a44ad1158afc9433a35587cb531d6d786c30 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -108,7 +108,8 @@ typedef struct { #define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12 #define EFI_PAL_CODE 13 #define EFI_PERSISTENT_MEMORY 14 -#define EFI_MAX_MEMORY_TYPE 15 +#define EFI_UNACCEPTED_MEMORY 15 +#define EFI_MAX_MEMORY_TYPE 16 /* Attribute values: */ #define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ @@ -368,6 +369,8 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_ZBOOT_MEDIA_GUID EFI_GUID(0xe565a30d, 0x47da, 0x4dbd, 0xb3, 0x54, 0x9b, 0xb5, 0xc8, 0x4f, 0x8b, 0xe2) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) +#define LINUX_EFI_BOOT_MEMMAP_GUID EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) +#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31) /* OEM GUIDs */ #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) @@ -462,6 +465,23 @@ typedef union { efi_system_table_32_t mixed_mode; } efi_system_table_t; +struct efi_boot_memmap { + unsigned long map_size; + unsigned long desc_size; + u32 desc_ver; + unsigned long map_key; + unsigned long buff_size; + efi_memory_desc_t map[]; +}; + +struct efi_unaccepted_memory { + u32 version; + u32 unit_size; + u64 phys_base; + u64 size; + unsigned long bitmap[]; +}; + /* * Architecture independent structure for describing a memory map for the * benefit of efi_memmap_init_early(), and for passing context between @@ -560,6 +580,7 @@ extern struct efi { unsigned long tpm_final_log; /* TPM2 Final Events Log table */ unsigned long mokvar_table; /* MOK variable config table */ unsigned 
long coco_secret; /* Confidential computing secret table */ + unsigned long unaccepted; /* Unaccepted memory table */ efi_get_time_t *get_time; efi_set_time_t *set_time; diff --git a/include/linux/mm.h b/include/linux/mm.h index 1c48efef07b8c4e5d6b5093d032f72b96e751bb5..c0f0ad55b0a2678baeaecc94829debb0e8959246 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -222,6 +222,9 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) +/* to align the pointer to the (prev) page boundary */ +#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) + /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) @@ -3539,5 +3542,25 @@ static inline void fixup_vma(struct vm_area_struct *vma) async_fork_fixup_vma(vma); } +#ifdef CONFIG_UNACCEPTED_MEMORY + +bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); + +void accept_memory(phys_addr_t start, phys_addr_t end); + +#else + +static inline bool range_contains_unaccepted_memory(phys_addr_t start, + phys_addr_t end) +{ + return false; +} + +static inline void accept_memory(phys_addr_t start, phys_addr_t end) +{ +} + +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5938a17e119de0394520dc443e1e0b862692cb56..00f3cdee0202fbae334e4cfcdb10532ac6b8b5c0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -162,6 +162,9 @@ enum zone_stat_item { NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, +#ifdef CONFIG_UNACCEPTED_MEMORY + NR_UNACCEPTED, +#endif #ifdef CONFIG_PAGE_PREZERO NR_ZEROED_PAGES, /* Pre-zeroed pages */ #endif @@ -805,6 +808,11 @@ struct zone { /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; +#ifdef CONFIG_UNACCEPTED_MEMORY + /* Pages to be 
accepted. All pages on the list are order MAX_ORDER - 1 */ + struct list_head unaccepted_pages; +#endif + /* zone flags, see below */ unsigned long flags; diff --git a/mm/memblock.c b/mm/memblock.c index fca9c5a1849ac4da0b5a08d5e9af975fd4890015..07aba80c8aae038d53070bd5f2d5e89d30a771db 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1374,6 +1374,15 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, */ kmemleak_alloc_phys(found, size, 0, 0); + /* + * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP, + * require memory to be accepted before it can be used by the + * guest. + * + * Accept the memory of the allocated buffer. + */ + accept_memory(found, found + size); + return found; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd55dc3513ed4725d09050ae9dbb6247ec8042d8..307f83f585a504a9c2fbce0dc069e6e5fe7382c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -393,6 +393,12 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +static bool page_contains_unaccepted(struct page *page, unsigned int order); /* unaccepted-memory helpers: defined near the end of mm/page_alloc.c, with stubs when CONFIG_UNACCEPTED_MEMORY is off */ +static void accept_page(struct page *page, unsigned int order); +static bool try_to_accept_memory(struct zone *zone, unsigned int order); +static inline bool has_unaccepted_memory(void); +static bool __free_unaccepted(struct page *page); + int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -1772,6 +1778,13 @@ void __free_pages_core(struct page *page, unsigned int order) atomic_long_add(nr_pages, &page_zone(page)->managed_pages); + if (page_contains_unaccepted(page, order)) { /* free_area[] has MAX_ORDER entries, so the largest buddy order is MAX_ORDER - 1; only such blocks may be parked on the unaccepted list, anything smaller (or anything in eager mode) is accepted right here. NOTE(review): assumes MAX_ORDER_NR_PAGES == 1 << (MAX_ORDER - 1); confirm in mmzone.h */ + if (order == MAX_ORDER - 1 && __free_unaccepted(page)) + return; + + accept_page(page, order); + } + /* * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining.
@@ -1924,6 +1937,9 @@ static void __init deferred_free_range(unsigned long pfn, return; } + /* Accept chunks smaller than MAX_ORDER upfront */ + accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); + for (i = 0; i < nr_pages; i++, page++, pfn++) { if ((pfn & (pageblock_nr_pages - 1)) == 0) set_pageblock_migratetype(page, MIGRATE_MOVABLE); @@ -3928,6 +3944,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z, if (!(alloc_flags & ALLOC_CMA)) unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + unusable_free += zone_page_state(z, NR_UNACCEPTED); /* parked pages are also added through __mod_zone_freepage_state() in __free_unaccepted(), yet cannot be handed out before they are accepted */ +#endif return unusable_free; } @@ -4227,6 +4246,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, gfp_mask)) { int ret; + if (has_unaccepted_memory()) { /* watermark check failed: accept some parked memory and retry this zone */ + if (try_to_accept_memory(zone, order)) + goto try_this_zone; + } + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * Watermark failed for this zone, but see if we can @@ -4297,6 +4321,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, return page; } else { + if (has_unaccepted_memory()) { + if (try_to_accept_memory(zone, order)) + goto try_this_zone; + } + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ if (static_branch_unlikely(&deferred_pages)) { @@ -6826,6 +6855,10 @@ static void __meminit zone_init_free_lists(struct zone *zone) zone->free_area[order].nr_zeroed = 0; #endif } + +#ifdef CONFIG_UNACCEPTED_MEMORY + INIT_LIST_HEAD(&zone->unaccepted_pages); +#endif } #if !defined(CONFIG_FLAT_NODE_MEM_MAP) @@ -9540,3 +9573,150 @@ bool has_managed_dma(void) return false; } #endif /* CONFIG_ZONE_DMA */ + +#ifdef CONFIG_UNACCEPTED_MEMORY + +/* Counts number of zones with unaccepted pages.
*/ +static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); + +static bool lazy_accept = true; /* default policy; overridden by accept_memory=lazy|eager on the command line */ + +static int __init accept_memory_parse(char *p) /* early_param handler: returns 0 for "lazy" or "eager", -EINVAL for a missing or unknown value */ +{ + if (p && !strcmp(p, "lazy")) { /* p is checked first: a bare "accept_memory" without "=value" must not reach strcmp() with NULL */ + lazy_accept = true; + return 0; + } else if (p && !strcmp(p, "eager")) { + lazy_accept = false; + return 0; + } else { + return -EINVAL; + } +} +early_param("accept_memory", accept_memory_parse); + +static bool page_contains_unaccepted(struct page *page, unsigned int order) /* asks range_contains_unaccepted_memory() about the physical range covered by the order-sized block at @page */ +{ + phys_addr_t start = page_to_phys(page); + phys_addr_t end = start + (PAGE_SIZE << order); + + return range_contains_unaccepted_memory(start, end); +} + +static void accept_page(struct page *page, unsigned int order) /* accepts the physical range covered by the order-sized block at @page */ +{ + phys_addr_t start = page_to_phys(page); + + accept_memory(start, start + (PAGE_SIZE << order)); +} + +static bool try_to_accept_memory_one(struct zone *zone) /* takes one order MAX_ORDER - 1 block off zone->unaccepted_pages, accepts it and releases it to the buddy allocator; returns false when the list is empty */ +{ + unsigned long flags; + struct page *page; + bool last; + + if (list_empty(&zone->unaccepted_pages)) + return false; + + spin_lock_irqsave(&zone->lock, flags); /* the unlocked list_empty() above is only a fast path; re-check under the lock */ + page = list_first_entry_or_null(&zone->unaccepted_pages, + struct page, lru); + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return false; + } + + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + + __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + + accept_page(page, MAX_ORDER - 1); /* same order that __free_pages_core() parked, matching the MAX_ORDER_NR_PAGES accounting above; done after dropping zone->lock */ + + __free_pages_ok(page, MAX_ORDER - 1, FPI_TO_TAIL); + + if (last) + static_branch_dec(&zones_with_unaccepted_pages); + + return true; +} + +static bool try_to_accept_memory(struct zone *zone, unsigned int order) +{ + long to_accept; + bool ret = false; + + /* How much to accept to get to high watermark?
*/ + to_accept = high_wmark_pages(zone) - + (zone_page_state(zone, NR_FREE_PAGES) - + __zone_watermark_unusable_free(zone, order, 0)); + + /* Accept at least one MAX_ORDER_NR_PAGES chunk, then continue until the shortfall is covered or the list runs dry */ + do { + if (!try_to_accept_memory_one(zone)) + break; + ret = true; + to_accept -= MAX_ORDER_NR_PAGES; + } while (to_accept > 0); + + return ret; +} + +static inline bool has_unaccepted_memory(void) /* tests the zones_with_unaccepted_pages static key, which __free_unaccepted() increments and try_to_accept_memory_one() decrements */ +{ + return static_branch_unlikely(&zones_with_unaccepted_pages); +} + +static bool __free_unaccepted(struct page *page) /* Parks @page on its zone's unaccepted list instead of accepting it now. Returns false, leaving acceptance to the caller, when lazy_accept is off. Both counters are bumped by MAX_ORDER_NR_PAGES, so the caller must only pass blocks of exactly that size. */ +{ + struct zone *zone = page_zone(page); + unsigned long flags; + bool first = false; + + if (!lazy_accept) + return false; + + spin_lock_irqsave(&zone->lock, flags); + first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); + __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + + if (first) /* the list was empty before this add: count the zone in the static key, after the lock is dropped */ + static_branch_inc(&zones_with_unaccepted_pages); + + return true; +} + +#else /* !CONFIG_UNACCEPTED_MEMORY: stubs so the callers above need no #ifdef */ + +static bool page_contains_unaccepted(struct page *page, unsigned int order) +{ + return false; +} + +static void accept_page(struct page *page, unsigned int order) +{ +} + +static bool try_to_accept_memory(struct zone *zone, unsigned int order) +{ + return false; +} + +static inline bool has_unaccepted_memory(void) +{ + return false; +} + +static bool __free_unaccepted(struct page *page) +{ + BUILD_BUG(); /* NOTE(review): presumably relies on the compiler discarding every call, because the page_contains_unaccepted() stub is constant false; confirm */ + return false; +} + +#endif /* CONFIG_UNACCEPTED_MEMORY */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 2957837fb3ae84f8ebe1bbc945b5ed894d9008f0..3e09dc13f566a9343a392f1ae517ec23367446dd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1162,6 +1162,9 @@ const char * const vmstat_text[] = { "nr_zspages", #endif "nr_free_cma", +#ifdef CONFIG_UNACCEPTED_MEMORY + "nr_unaccepted", /* mirrors the position of NR_UNACCEPTED in enum zone_stat_item (after the free-CMA entry, before the pre-zeroed one) */ +#endif #ifdef CONFIG_PAGE_PREZERO "nr_zeroed_pages", #endif