diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig
index c2a236f2e84608b73ab42f80cec52c62a1fe33e9..0a98685b5075aa199f46963210103ddc44d7073f 100644
--- a/drivers/ras/Kconfig
+++ b/drivers/ras/Kconfig
@@ -29,6 +29,16 @@ menuconfig RAS
 	  so have ideal availability, but may be unreliable, with
 	  frequent data corruption.
 
+config PAGE_EJECT
+	tristate "page eject"
+	default m
+	depends on MEMORY_FAILURE
+	help
+	  Used to eject pages, which is achieved via soft_offline_page and
+	  unpoison_memory. A linked list is maintained to log the pfns
+	  which are offlined by this module. Only the pfns present in the
+	  list are allowed to go online.
+
 if RAS
 
 source "arch/x86/ras/Kconfig"
diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile
index 6f0404f501071b37200119fe7d4f90d3c73cf49a..ba551a2403c325895a3d3c011e36027c86d861e8 100644
--- a/drivers/ras/Makefile
+++ b/drivers/ras/Makefile
@@ -2,3 +2,4 @@
 obj-$(CONFIG_RAS)	+= ras.o
 obj-$(CONFIG_DEBUG_FS)	+= debugfs.o
 obj-$(CONFIG_RAS_CEC)	+= cec.o
+obj-$(CONFIG_PAGE_EJECT)	+= page_eject.o
diff --git a/drivers/ras/page_eject.c b/drivers/ras/page_eject.c
new file mode 100644
index 0000000000000000000000000000000000000000..62a5dbd0fa0a3f92bfecf5ea1667ac233205001a
--- /dev/null
+++ b/drivers/ras/page_eject.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved.
+ */
+
+#define pr_fmt(fmt) "page eject: " fmt
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+static struct list_head eject_page_list = LIST_HEAD_INIT(eject_page_list);
+static DEFINE_MUTEX(eject_page_mutex);
+static struct kobject *eject_page_kobj;
+
+struct ejected_pfn {
+	struct list_head list;
+	unsigned long pfn;
+};
+
+static struct ejected_pfn *page_eject_remove_pfn_locked(unsigned long pfn)
+{
+	struct ejected_pfn *item, *next, *found = NULL;
+
+	mutex_lock(&eject_page_mutex);
+	list_for_each_entry_safe(item, next, &eject_page_list, list) {
+		if (pfn == item->pfn) {
+			found = item;
+			list_del(&found->list);
+			break;
+		}
+	}
+	mutex_unlock(&eject_page_mutex);
+
+	return found;
+}
+
+static void page_eject_add_pfn_locked(struct ejected_pfn *item)
+{
+	mutex_lock(&eject_page_mutex);
+	list_add_tail(&item->list, &eject_page_list);
+	mutex_unlock(&eject_page_mutex);
+}
+
+static void page_eject_clear_list_locked(void)
+{
+	struct ejected_pfn *item, *next;
+
+	mutex_lock(&eject_page_mutex);
+	list_for_each_entry_safe(item, next, &eject_page_list, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+	mutex_unlock(&eject_page_mutex);
+}
+
+static int page_eject_offline_page(unsigned long pfn)
+{
+	struct ejected_pfn *item;
+	struct page *page;
+	int ret;
+
+	page = pfn_to_online_page(pfn);
+	if (!page)
+		return -EINVAL;
+
+	if (PageHWPoison(page)) {
+		pr_err("failed to offline page, page is already offlined, pfn: %#lx\n", pfn);
+		return -EINVAL;
+	}
+
+	/* allocate up front so a successfully offlined pfn is always logged */
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	current->flags |= PF_MCS;
+	/*
+	 * If soft_offline_page() returns 0 because the page is already
+	 * hwpoisoned, this pfn is still added to the list; the entry is
+	 * removed again when the page is onlined, since it is poisoned.
+	 */
+	ret = soft_offline_page(pfn, 0);
+	current->flags &= ~PF_MCS;
+	if (ret) {
+		pr_err("failed to offline page, soft_offline_page failed(%d), pfn: %#lx\n",
+		       ret, pfn);
+		kfree(item);
+		return ret;
+	}
+
+	item->pfn = pfn;
+	page_eject_add_pfn_locked(item);
+
+	return 0;
+}
+
+static int page_eject_online_page(unsigned long pfn)
+{
+	struct ejected_pfn *item;
+	struct page *page;
+	int ret;
+
+	page = pfn_to_online_page(pfn);
+	if (!page)
+		return -EINVAL;
+
+	item = page_eject_remove_pfn_locked(pfn);
+	if (!item) {
+		pr_err("failed to online page, pfn not found in list, pfn: %#lx\n", pfn);
+		return -EINVAL;
+	}
+
+	ret = unpoison_memory(pfn);
+	if (!ret) {
+		kfree(item);
+		return 0;
+	}
+
+	/* re-add pfn to the list if unpoison failed */
+	page_eject_add_pfn_locked(item);
+	pr_err("failed to online page, unpoison_memory failed(%d), pfn: %#lx\n",
+	       ret, pfn);
+	return ret;
+}
+
+static int page_eject_remove_page(unsigned long pfn)
+{
+	struct ejected_pfn *item;
+
+	item = page_eject_remove_pfn_locked(pfn);
+	if (!item) {
+		pr_info("failed to remove page, pfn not found in list, pfn: %#lx\n", pfn);
+		return -EINVAL;
+	}
+
+	kfree(item);
+
+	return 0;
+}
+
+static ssize_t offline_store(struct kobject *kobj, struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	u64 paddr;
+	int res;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (kstrtoull(buf, 16, &paddr))
+		return -EINVAL;
+
+	res = page_eject_offline_page(paddr >> PAGE_SHIFT);
+	if (res)
+		return res;
+
+	return count;
+}
+
+static ssize_t online_store(struct kobject *kobj, struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	u64 paddr;
+	int res;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (kstrtoull(buf, 16, &paddr))
+		return -EINVAL;
+
+	res = page_eject_online_page(paddr >> PAGE_SHIFT);
+	if (res)
+		return res;
+
+	return count;
+}
+
+static ssize_t remove_store(struct kobject *kobj, struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	u64 paddr;
+	int res;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (kstrtoull(buf, 16, &paddr))
+		return -EINVAL;
+
+	res = page_eject_remove_page(paddr >> PAGE_SHIFT);
+	if (res)
+		return res;
+
+	return count;
+}
+
+static struct kobj_attribute online_attr =
+	__ATTR(online_page, 0200, NULL, online_store);
+static struct kobj_attribute offline_attr =
+	__ATTR(offline_page, 0200, NULL, offline_store);
+static struct kobj_attribute remove_attr =
+	__ATTR(remove_page, 0200, NULL, remove_store);
+
+static struct attribute *eject_page_attrs[] = {
+	&offline_attr.attr,
+	&online_attr.attr,
+	&remove_attr.attr,
+	NULL,
+};
+
+static struct attribute_group eject_page_attr_group = {
+	.attrs = eject_page_attrs,
+};
+
+static int __init page_eject_init(void)
+{
+	int ret;
+
+	eject_page_kobj = kobject_create_and_add("page_eject", kernel_kobj);
+	if (!eject_page_kobj)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(eject_page_kobj, &eject_page_attr_group);
+	if (ret) {
+		kobject_put(eject_page_kobj);
+		return ret;
+	}
+
+	pr_info("page eject initialized\n");
+	return 0;
+}
+
+static void __exit page_eject_exit(void)
+{
+	/* removing the kobject waits for in-flight sysfs writers */
+	kobject_put(eject_page_kobj);
+
+	page_eject_clear_list_locked();
+
+	pr_info("page eject exited\n");
+}
+
+module_init(page_eject_init);
+module_exit(page_eject_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ma Wupeng");
+MODULE_DESCRIPTION("page eject");
diff --git a/fs/coredump.c b/fs/coredump.c
index 6c0d0a42fda940628d527c7080c2d2eef3477fc4..535c3fdc159849a04a1e04a0f0aa706c96863a0d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -907,9 +907,9 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		if (page) {
 			void *kaddr = kmap(page);
-			current->flags |= PF_COREDUMP_MCS;
+			current->flags |= PF_MCS;
 			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
-			current->flags &= ~PF_COREDUMP_MCS;
+			current->flags &= ~PF_MCS;
 			kunmap(page);
 			put_page(page);
 		} else {
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index cc5fe6c620adcfb1ef5fd6e8e8a743ae8d3cc070..f86f8898941b6b7195c4e3cfcd1d01252e42de52 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -396,4 +396,53 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
 	kunmap_atomic(to);
 }
 
+#ifdef copy_mc_to_kernel
+/*
+ * If the architecture supports machine check exception handling, define
+ * the #MC version of copy_highpage. It copies a memory page with a #MC
+ * in the source page (@from) handled, and returns the number of bytes
+ * not copied if there was a #MC, otherwise 0 for success.
+ */
+static inline int copy_mc_highpage(struct page *to, struct page *from)
+{
+	char *vfrom, *vto;
+	int ret;
+
+	vfrom = kmap_atomic(from);
+	vto = kmap_atomic(to);
+	ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
+	kunmap_atomic(vto);
+	kunmap_atomic(vfrom);
+
+	return ret;
+}
+
+static inline int copy_mc_highpages(struct page *to, struct page *from, int nr_pages)
+{
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		cond_resched();
+		ret = copy_mc_highpage(to + i, from + i);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+#else
+static inline int copy_mc_highpage(struct page *to, struct page *from)
+{
+	copy_highpage(to, from);
+	return 0;
+}
+
+static inline int copy_mc_highpages(struct page *to, struct page *from, int nr_pages)
+{
+	copy_highpages(to, from, nr_pages);
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_HIGHMEM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d39427f8044d3a5a4b57f4ee9e7aefbe82b3c4a5..5652309e99ead520dbb5dec0654617ceea46d981 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1648,7 +1648,7 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
-#define PF_COREDUMP_MCS		0x01000000	/* Task coredump support machine check safe */
+#define PF_MCS			0x01000000	/* Machine check safe copy is supported for this task (e.g. coredump) */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 0a4b7aa47097b3a53e50f286d81c9d61a68e2c42..ce8c225237f56e9e160fe2524f44c9fcd7371401 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -752,7 +752,7 @@ EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
 
 static void *memcpy_iter(void *to, const void *from, __kernel_size_t size)
 {
-	if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && current->flags & PF_COREDUMP_MCS)
+	if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && current->flags & PF_MCS)
 		return (void *)copy_mc_to_kernel(to, from, size);
 	else
 		return memcpy(to, from, size);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 406895b98422a660059238cb5d58dad19caf83f0..b87ba13fd56b522b306ef331e5f9d2d5e0726d4b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2184,3 +2184,4 @@ int soft_offline_page(unsigned long pfn, int flags)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(soft_offline_page);
diff --git a/mm/migrate.c b/mm/migrate.c
index 646918708922f03f5f265898ba621379cf8b2268..7f036a63473b17526ae1111ff713960339e75e41 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -548,26 +548,35 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  * arithmetic will work across the entire page.  We need something more
  * specialized.
  */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
-				 int nr_pages)
+static int __copy_gigantic_page(struct page *dst, struct page *src,
+				int nr_pages, bool mc)
 {
-	int i;
+	int i, ret = 0;
 	struct page *dst_base = dst;
 	struct page *src_base = src;
 
 	for (i = 0; i < nr_pages; ) {
 		cond_resched();
-		copy_highpage(dst, src);
+		if (!mc) {
+			copy_highpage(dst, src);
+		} else {
+			ret = copy_mc_highpage(dst, src);
+			if (ret)
+				return ret;
+		}
 		i++;
 		dst = mem_map_next(dst, dst_base, i);
 		src = mem_map_next(src, src_base, i);
 	}
+
+	return ret;
 }
 
-static void copy_huge_page(struct page *dst, struct page *src)
+static int copy_huge_page(struct page *dst, struct page *src, bool mc)
 {
 	int nr_pages;
+	int ret = 0;
 
 	if (PageHuge(src)) {
 		/* hugetlbfs page */
@@ -575,8 +584,8 @@ static void copy_huge_page(struct page *dst, struct page *src)
 		nr_pages = pages_per_huge_page(h);
 
 		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
-			__copy_gigantic_page(dst, src, nr_pages);
-			return;
+			ret = __copy_gigantic_page(dst, src, nr_pages, mc);
+			return ret;
 		}
 	} else {
 		/* thp page */
@@ -584,7 +593,12 @@ static void copy_huge_page(struct page *dst, struct page *src)
 		nr_pages = thp_nr_pages(src);
 	}
 
-	copy_highpages(dst, src, nr_pages);
+	if (!mc)
+		copy_highpages(dst, src, nr_pages);
+	else
+		ret = copy_mc_highpages(dst, src, nr_pages);
+
+	return ret;
 }
 
 /*
@@ -666,7 +680,7 @@ EXPORT_SYMBOL(migrate_page_states);
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
 	if (PageHuge(page) || PageTransHuge(page))
-		copy_huge_page(newpage, page);
+		copy_huge_page(newpage, page, false);
 	else
 		copy_highpage(newpage, page);
 
@@ -674,6 +688,23 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 }
 EXPORT_SYMBOL(migrate_page_copy);
 
+int migrate_page_copy_mc(struct page *newpage, struct page *page)
+{
+	int rc;
+
+	if (PageHuge(page) || PageTransHuge(page))
+		rc = copy_huge_page(newpage, page, true);
+	else
+		rc = copy_mc_highpage(newpage, page);
+
+	if (rc)
+		return rc;
+
+	migrate_page_states(newpage, page);
+
+	return 0;
+}
+
 /************************************************************
  *                    Migration functions
  ***********************************************************/
@@ -691,10 +722,18 @@ int migrate_page_extra(struct address_space *mapping,
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 
-	if (mode != MIGRATE_SYNC_NO_COPY)
-		migrate_page_copy(newpage, page);
-	else
+	if (mode != MIGRATE_SYNC_NO_COPY) {
+		if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) &&
+		    current->flags & PF_MCS) {
+			rc = migrate_page_copy_mc(newpage, page);
+			if (rc)
+				return -EFAULT;
+		} else {
+			migrate_page_copy(newpage, page);
+		}
+	} else {
 		migrate_page_states(newpage, page);
+	}
 
 	return MIGRATEPAGE_SUCCESS;
 }
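
For reference, a minimal userspace sketch (not part of the patch) of how the new sysfs interface is driven. The paths and attribute names follow the module above (kernel_kobj plus "page_eject", write-only offline_page/online_page/remove_page files, hex physical address parsed via kstrtoull base 16); the address 0x1234000 is a made-up example. Assumes CONFIG_PAGE_EJECT is enabled, the module is loaded, and the caller has CAP_SYS_ADMIN.

#include <stdio.h>
#include <stdlib.h>

static int write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");
	int ret = 0;

	if (!f) {
		perror(path);
		return -1;
	}
	/* the store handlers parse a hex physical address, "0x" prefix optional */
	if (fprintf(f, "%s", val) < 0)
		ret = -1;
	if (fclose(f) != 0)
		ret = -1;
	if (ret)
		perror(path);
	return ret;
}

int main(void)
{
	const char *paddr = "1234000";	/* hypothetical physical address */

	/* soft-offline the backing page and log its pfn in the module's list */
	if (write_sysfs("/sys/kernel/page_eject/offline_page", paddr))
		return EXIT_FAILURE;
	/* bring the same page back online; only logged pfns are accepted */
	if (write_sysfs("/sys/kernel/page_eject/online_page", paddr))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}

Note that onlining goes through unpoison_memory(), so a pfn that was never offlined through this module is rejected, which is what the Kconfig help text means by "only the pfns present in the list are allowed to go online".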