From 2d0418798a3c0d311ecad22720787ab83b1acfe7 Mon Sep 17 00:00:00 2001 From: Zhu Yanhai Date: Fri, 28 Oct 2022 15:25:13 +0800 Subject: [PATCH] anolis: mm: gup: allow to follow _PAGE_DEVMAP && !ZONE_DEVICE pages optionally ANBZ: #2586 Calling get_user_pages() on a range of user memory that has been mmaped from a DAX file will fail when there are no 'struct page' to describe those pages. This problem has been addressed in some device drivers by adding optional struct page support for pages under the control of the driver. To be specific, gup would query the pgmap_radix tree first and acquire a reference against a pgmap instance before it ever touches the page. This is to be sure the struct page won't be released under the nose of gup. And to add any memory into pgmap_radix, it must not overlap with the existing PFNs. However, there are cases where users need to map normal memory via DAX without losing the support of gup. Usually this could be done with a memmap parameter appended to the kernel cmdline to produce a fake PMEM device. That is not flexible and sometimes impractical (imagine how you could expose a BRD via DAX? A BRD device can't be stacked onto another PMEM device anyway). Given that what gup really cares about is merely ensuring the existence of the page, and if the struct pages behind the DAX entries (_PAGE_DEVMAP entries) are actually from normal memory, gup won't need to worry about whether they will be released on the fly. 
Therefore, it should be fine to fix the code like below, static inline bool pg_stable_pfn(unsigned long pfn) { int nid = pfn_to_nid(pfn); struct zone *zone_device = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; if (zone_spans_pfn(zone_device, pfn)) return false; else return true; } [and later in gup] page = vm_normal_page(vma, address, pte); if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { if (pg_stable_pfn(pte_pfn(pte))) page = pte_page(pte); else { /* * Only return device mapping pages in the FOLL_GET case * since they are only valid while holding the pgmap * reference. */ pgmap = get_dev_pagemap(pte_pfn(pte), NULL); if (pgmap) page = pte_page(pte); else goto no_page; } [cut here] Nevertheless, it's too risky to expose pages of this kind to the other parts of the world all at once. So this patch proposes a FOLL_GET_PGSTABLE flag associated with FOLL_GET. The purpose is to let selected users of gup (it's KVM in this patch) take the shot first, while leaving the others as they were. 
Signed-off-by: Zhu Yanhai Signed-off-by: Simon Guo --- include/linux/mm.h | 1 + mm/gup.c | 38 ++++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 3 ++- 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8300efb66995..94ae81a9ac78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2947,6 +2947,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_GET_PGSTABLE 0x80000000 /* do get_page on page not in ZONE_DEVICE */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each diff --git a/mm/gup.c b/mm/gup.c index b8e68117a609..93380576cf79 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -392,6 +392,28 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } +#ifdef CONFIG_ZONE_DEVICE +static inline bool pg_stable_pfn(unsigned long pfn) +{ + int nid = pfn_to_nid(pfn); + struct zone *zone_device = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; + + /* + * The other zones' pages are not kept alive by pgmap + * but memory hotplug instead. + */ + if (zone_spans_pfn(zone_device, pfn)) + return false; + else + return true; +} +#else +static inline bool pg_stable_pfn(unsigned long pfn) +{ + return false; +} +#endif + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, struct dev_pagemap **pgmap) @@ -445,6 +467,21 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, page = vm_normal_page(vma, address, pte); if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { + /* + * The pages which are not in ZONE_DEVICE don't depend on + * the pgmap ref to keep alive (stable). 
Therefore if we are + * looking at such a page and the caller explicitly claimed that + * he doesn't really care whether this page is from ZONE_DEVICE + * or not - either a page from ZONE_NORMAL ZONE_MOVABLE etc or + * a page from ZONE_DEVICE are both fine - we could simply + * get_page for those stable ones straightly. + */ + if ((flags & FOLL_GET_PGSTABLE) && + pg_stable_pfn(pte_pfn(pte))) { + page = pte_page(pte); + goto page_required; + } + /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap @@ -471,6 +508,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, } } +page_required: if (flags & FOLL_SPLIT && PageTransCompound(page)) { get_page(page); pte_unmap_unlock(ptep, ptl); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e6b44ffc1521..85b0bf834811 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1867,7 +1867,8 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, bool *writable, kvm_pfn_t *pfn) { - unsigned int flags = FOLL_HWPOISON; + /* The purpose of calling GUP below is only to get a reference. */ + unsigned int flags = FOLL_HWPOISON | FOLL_GET_PGSTABLE; struct page *page; int npages = 0; -- Gitee