diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8300efb6699580abaaf67021862a49cdfa74cc78..94ae81a9ac78c97e37d77d89e5e50ca886ecdcd3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2947,6 +2947,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
 #define FOLL_PIN	0x40000	/* pages must be released via unpin_user_page */
 #define FOLL_FAST_ONLY	0x80000	/* gup_fast: prevent fall-back to slow gup */
+#define FOLL_GET_PGSTABLE	0x80000000 /* do get_page on page not in ZONE_DEVICE */
 
 /*
  * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
diff --git a/mm/gup.c b/mm/gup.c
index b8e68117a609cdb77ef1edb810ba1dc24441da00..93380576cf7953ba4a96a2a044da91adda6f7b26 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -392,6 +392,27 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 	((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 }
 
+#ifdef CONFIG_ZONE_DEVICE
+/*
+ * Return true if the page backing @pfn is kept alive by memory hotplug
+ * (i.e. it lives outside ZONE_DEVICE) and therefore does not need a
+ * pgmap reference to remain stable.
+ */
+static inline bool pg_stable_pfn(unsigned long pfn)
+{
+	int nid = pfn_to_nid(pfn);
+	struct zone *zone_device = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+
+	return !zone_spans_pfn(zone_device, pfn);
+}
+#else
+static inline bool pg_stable_pfn(unsigned long pfn)
+{
+	/* No ZONE_DEVICE: be conservative and keep the regular GUP path. */
+	return false;
+}
+#endif
+
 static struct page *follow_page_pte(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd, unsigned int flags,
 		struct dev_pagemap **pgmap)
@@ -445,6 +466,21 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 
 	page = vm_normal_page(vma, address, pte);
 	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+		/*
+		 * Pages outside ZONE_DEVICE do not rely on the pgmap
+		 * reference to stay alive; they are stable.  So when the
+		 * caller explicitly said it does not care whether the page
+		 * comes from ZONE_DEVICE or not (ZONE_NORMAL, ZONE_MOVABLE,
+		 * etc. are all just as fine), we can simply take a normal
+		 * page reference on such a stable page directly.
+		 */
+		if ((flags & FOLL_GET_PGSTABLE) &&
+		    pg_stable_pfn(pte_pfn(pte))) {
+			page = pte_page(pte);
+			goto page_required;
+		}
+
 		/*
 		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
 		 * case since they are only valid while holding the pgmap
@@ -471,6 +507,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 		}
 	}
 
+page_required:
 	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
 		get_page(page);
 		pte_unmap_unlock(ptep, ptl);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e6b44ffc1521f880bf0acc83e2387a9b1ed12e03..85b0bf83481158fa5491f3ad8f94fb2d32d797a2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1867,7 +1867,8 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 			   bool *writable, kvm_pfn_t *pfn)
 {
-	unsigned int flags = FOLL_HWPOISON;
+	/* The purpose of calling GUP below is only to get a reference. */
+	unsigned int flags = FOLL_HWPOISON | FOLL_GET_PGSTABLE;
 	struct page *page;
 	int npages = 0;
 