diff --git a/include/linux/mm.h b/include/linux/mm.h
index 886569d56066fe4611479b5b21fb2ed1cd400f22..1f36bf9ee02f7e9ae1a3d9cc8e16666d0260d663 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4063,6 +4063,7 @@ enum mf_action_page_type {
 	MF_MSG_DIFFERENT_COMPOUND,
 	MF_MSG_HUGE,
 	MF_MSG_FREE_HUGE,
+	MF_MSG_GET_HWPOISON,
 	MF_MSG_UNMAP_FAILED,
 	MF_MSG_DIRTY_SWAPCACHE,
 	MF_MSG_CLEAN_SWAPCACHE,
@@ -4077,6 +4078,7 @@ enum mf_action_page_type {
 	MF_MSG_DAX,
 	MF_MSG_UNSPLIT_THP,
 	MF_MSG_FREE_DPOOL,
+	MF_MSG_ALREADY_POISONED,
 	MF_MSG_UNKNOWN,
 };
 
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 9dbca0e03b3d8dd4893552900948d8396d291cb3..77a7f2a80a973571aa780b640b2859033a3b6846 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -414,6 +414,7 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
 	EM ( MF_MSG_HUGE, "huge page" ) \
 	EM ( MF_MSG_FREE_HUGE, "free huge page" ) \
+	EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" ) \
 	EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \
 	EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \
 	EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \
@@ -428,6 +429,7 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_DAX, "dax page" ) \
 	EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \
 	EM ( MF_MSG_FREE_DPOOL, "free dynamic pool page" ) \
+	EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \
 	EMe ( MF_MSG_UNKNOWN, "unknown page" )
 
 /*
diff --git a/mm/madvise.c b/mm/madvise.c
index b3a1500decca373a8b4c3a550f62612d8c55ecd3..3b63ab74c8a570e634a3acc028084766e31f923a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1215,7 +1215,7 @@ static int madvise_inject_error(int behavior,
 		} else {
 			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
 				 pfn, start);
-			ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
+			ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
 			if (ret == -EOPNOTSUPP)
 				ret = 0;
 		}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7f7b756118692ab17f38c3e952ce358661e39c3a..b0733374b017d8985fe6586bfc29eb3cfc12e06b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -517,22 +517,15 @@ void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
  *
  * Only do anything when FORCEKILL is set, otherwise just free the
  * list (this is used for clean pages which do not need killing)
- * Also when FAIL is set do a force kill because something went
- * wrong earlier.
  */
-static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
+static void kill_procs(struct list_head *to_kill, int forcekill,
 		unsigned long pfn, int flags)
 {
 	struct to_kill *tk, *next;
 
 	list_for_each_entry_safe(tk, next, to_kill, nd) {
 		if (forcekill) {
-			/*
-			 * In case something went wrong with munmapping
-			 * make sure the process doesn't catch the
-			 * signal and then access the memory. Just kill it.
-			 */
-			if (fail || tk->addr == -EFAULT) {
+			if (tk->addr == -EFAULT) {
 				pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 				       pfn, tk->tsk->comm, tk->tsk->pid);
 				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
@@ -878,6 +871,28 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
 	return ret > 0 ? -EHWPOISON : -EFAULT;
 }
 
+/*
+ * MF_IGNORED - The m-f() handler marks the page as PG_hwpoison.
+ * But it could not do more to isolate the page from being accessed
+ * again, nor does it kill the process. This is extremely rare and one
+ * potential cause is that the page state has been changed by an
+ * underlying race condition. This is the most severe outcome.
+ *
+ * MF_FAILED - The m-f() handler marks the page as PG_hwpoison.
+ * It should have killed the process, but it can't isolate the page,
+ * due to conditions such as an extra pin or unmap failure. Accessing
+ * the page again may trigger another MCE, and the process will be
+ * killed by the m-f() handler immediately.
+ *
+ * MF_DELAYED - The m-f() handler marks the page as PG_hwpoison.
+ * The page is unmapped and removed from the LRU or file mapping.
+ * An attempt to access the page again will trigger a page fault, and
+ * the PF handler will kill the process.
+ *
+ * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoison.
+ * The page has been completely isolated, that is, unmapped, taken out
+ * of the buddy system, or hole-punched out of the file mapping.
+ */
 static const char *action_name[] = {
 	[MF_IGNORED] = "Ignored",
 	[MF_FAILED] = "Failed",
@@ -892,6 +907,7 @@ static const char * const action_page_types[] = {
 	[MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
 	[MF_MSG_HUGE] = "huge page",
 	[MF_MSG_FREE_HUGE] = "free huge page",
+	[MF_MSG_GET_HWPOISON] = "get hwpoison page",
 	[MF_MSG_UNMAP_FAILED] = "unmapping failed page",
 	[MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
 	[MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
@@ -906,6 +922,7 @@ static const char * const action_page_types[] = {
 	[MF_MSG_DAX] = "dax page",
 	[MF_MSG_UNSPLIT_THP] = "unsplit thp",
 	[MF_MSG_FREE_DPOOL] = "free dynamic pool page",
+	[MF_MSG_ALREADY_POISONED] = "already poisoned",
 	[MF_MSG_UNKNOWN] = "unknown page",
 };
 
@@ -1014,12 +1031,13 @@ static int me_kernel(struct page_state *ps, struct page *p)
 
 /*
  * Page in unknown state. Do nothing.
+ * This is a catch-all in case we fail to make sense of the page state.
  */
 static int me_unknown(struct page_state *ps, struct page *p)
 {
 	pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
 	unlock_page(p);
-	return MF_FAILED;
+	return MF_IGNORED;
 }
 
 /*
@@ -1661,7 +1679,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 */
 	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
 		    !unmap_success;
-	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
+	kill_procs(&tokill, forcekill, pfn, flags);
 
 	return unmap_success;
 }
@@ -1689,7 +1707,12 @@ static int identify_page_state(unsigned long pfn, struct page *p,
 	return page_action(ps, p, pfn);
 }
 
-static int try_to_split_thp_page(struct page *page)
+/*
+ * When 'release' is 'false', it means that if the thp split has failed,
+ * there is still more to do, hence the page refcount we took earlier
+ * is still needed.
+ */
+static int try_to_split_thp_page(struct page *page, bool release)
 {
 	int ret;
 
@@ -1697,7 +1720,7 @@ static int try_to_split_thp_page(struct page *page)
 	ret = split_huge_page(page);
 	unlock_page(page);
 
-	if (unlikely(ret))
+	if (ret && release)
 		put_page(page);
 
 	return ret;
@@ -1725,7 +1748,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
 		unmap_mapping_range(mapping, start, size, 0);
 	}
 
-	kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
+	kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags);
 }
 
 /*
@@ -2059,6 +2082,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 		if (flags & MF_ACTION_REQUIRED) {
 			folio = page_folio(p);
 			res = kill_accessing_process(current, folio_pfn(folio), flags);
+			action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
 		}
 		return res;
 	} else if (res == -EBUSY) {
@@ -2066,7 +2090,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 			flags |= MF_NO_RETRY;
 			goto retry;
 		}
-		return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+		return action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
 	}
 
 	folio = page_folio(p);
@@ -2103,7 +2127,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 
 	if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) {
 		folio_unlock(folio);
-		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED);
 	}
 
 	return identify_page_state(pfn, p, page_flags);
@@ -2166,6 +2190,22 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	return rc;
 }
 
+/*
+ * The calling condition is as such: the thp split has failed, the page
+ * might have been RDMA pinned, and not much can be done for recovery.
+ * But a SIGBUS should be delivered with vaddr provided so that the user
+ * application has a chance to recover. Also, application processes'
+ * election for MCE early kill will be honored.
+ */
+static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
+		struct folio *folio)
+{
+	LIST_HEAD(tokill);
+
+	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
+	kill_procs(&tokill, true, pfn, flags);
+}
+
 /**
  * memory_failure - Handle memory failure of a page.
  * @pfn: Page Number of the corrupted page
@@ -2240,6 +2280,7 @@ int memory_failure(unsigned long pfn, int flags)
 			res = kill_accessing_process(current, pfn, flags);
 		if (flags & MF_COUNT_INCREASED)
 			put_page(p);
+		action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
 		goto unlock_mutex;
 	}
 
@@ -2280,11 +2321,22 @@ int memory_failure(unsigned long pfn, int flags)
 			}
 			goto unlock_mutex;
 		} else if (res < 0) {
-			res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+			res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
 			goto unlock_mutex;
 		}
 	}
 
+	/* filter pages that are protected from hwpoison test by users */
+	lock_page(p);
+	if (hwpoison_filter(p)) {
+		ClearPageHWPoison(p);
+		unlock_page(p);
+		put_page(p);
+		res = -EOPNOTSUPP;
+		goto unlock_mutex;
+	}
+	unlock_page(p);
+
 	hpage = compound_head(p);
 	if (PageTransHuge(hpage)) {
 		/*
@@ -2301,8 +2353,11 @@ int memory_failure(unsigned long pfn, int flags)
 		 * page is a valid handlable page.
 		 */
 		SetPageHasHWPoisoned(hpage);
-		if (try_to_split_thp_page(p) < 0) {
-			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
+		if (try_to_split_thp_page(p, false) < 0) {
+			res = -EHWPOISON;
+			kill_procs_now(p, pfn, flags, page_folio(hpage));
+			put_page(p);
+			action_result(pfn, MF_MSG_UNSPLIT_THP, MF_FAILED);
 			goto unlock_mutex;
 		}
 		VM_BUG_ON_PAGE(!page_count(p), p);
@@ -2348,14 +2403,6 @@ int memory_failure(unsigned long pfn, int flags)
 	 */
 	page_flags = p->flags;
 
-	if (hwpoison_filter(p)) {
-		ClearPageHWPoison(p);
-		unlock_page(p);
-		put_page(p);
-		res = -EOPNOTSUPP;
-		goto unlock_mutex;
-	}
-
 	/*
 	 * __munlock_folio() may clear a writeback page's LRU flag without
 	 * page_lock. We need wait writeback completion for this page or it
@@ -2375,7 +2422,7 @@ int memory_failure(unsigned long pfn, int flags)
 	 * Abort on fail: __filemap_remove_folio() assumes unmapped page.
 	 */
 	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
-		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED);
 		goto unlock_page;
 	}
 
@@ -2703,7 +2750,7 @@ static int soft_offline_in_use_page(struct page *page)
 	};
 
 	if (!huge && folio_test_large(folio)) {
-		if (try_to_split_thp_page(page)) {
+		if (try_to_split_thp_page(page, true)) {
 			pr_info("soft offline: %#lx: thp split failed\n", pfn);
 			return -EBUSY;
 		}
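For context only (not part of the patch): a minimal userspace sketch of the recovery flow that the kill_procs_now() comment and the madvise.c change describe. It elects MCE early kill via prctl(PR_MCE_KILL), installs a SIGBUS handler that reads the poisoned virtual address from si_addr, and software-injects an error with madvise(MADV_HWPOISON), which now reaches memory_failure() with MF_ACTION_REQUIRED set. The program name and exact behavior here are assumptions for illustration; it needs CAP_SYS_ADMIN and a kernel built with CONFIG_MEMORY_FAILURE.

/* hwpoison-sigbus.c - illustrative sketch, not part of the kernel patch */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ucontext)
{
	/* BUS_MCEERR_AR: action required; BUS_MCEERR_AO: action optional */
	const char *kind = (si->si_code == BUS_MCEERR_AR) ? "AR" : "AO";

	/*
	 * si_addr is the poisoned virtual address, si_addr_lsb the log2 of
	 * the affected range (typically the page shift). fprintf() is not
	 * async-signal-safe; it is used here only to keep the sketch short.
	 */
	fprintf(stderr, "SIGBUS (%s) at %p, lsb=%d\n",
		kind, si->si_addr, (int)si->si_addr_lsb);
	/* A real application might drop the page and reload it from backup. */
	_exit(EXIT_FAILURE);
}

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	struct sigaction sa;
	char *buf;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	/* Elect early kill: get SIGBUS as soon as poison is found. */
	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);

	buf = aligned_alloc(pagesz, pagesz);
	if (!buf)
		return 1;
	memset(buf, 0xaa, pagesz);

	/* Software-inject poison into the page backing buf. */
	if (madvise(buf, pagesz, MADV_HWPOISON))
		perror("madvise(MADV_HWPOISON)");

	/* If SIGBUS was not already delivered above, this access raises it. */
	buf[0] = 0;
	return 0;
}

Built with something like cc -O2 hwpoison-sigbus.c and run as root, the handler should report the poisoned address; the intent of the MF_ACTION_REQUIRED change above appears to be that the injecting process is signalled with an action-required (BUS_MCEERR_AR) SIGBUS rather than an advisory one.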