diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy new file mode 100644 index 0000000000000000000000000000000000000000..8ac327fd7fb6e3ffedfc35c13324e41f5c12b7e3 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy @@ -0,0 +1,4 @@ +What: /sys/kernel/mm/mempolicy/ +Date: January 2024 +Contact: Linux memory management mailing list +Description: Interface for Mempolicy diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave new file mode 100644 index 0000000000000000000000000000000000000000..0b7972de04e9392b0a56bbb5d1e0a8b963fa3d6f --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave @@ -0,0 +1,23 @@ +What: /sys/kernel/mm/mempolicy/weighted_interleave/ +Date: January 2024 +Contact: Linux memory management mailing list +Description: Configuration Interface for the Weighted Interleave policy + +What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN +Date: January 2024 +Contact: Linux memory management mailing list +Description: Weight configuration interface for nodeN + + The interleave weight for a memory node (N). These weights are + used by tasks which have set their mempolicy to + MPOL_WEIGHTED_INTERLEAVE. + + These weights only affect new allocations; changes at runtime + will not cause migrations of already allocated pages. + + Minimum weight: 1 + Maximum weight: 255 + + Writing an empty string or `0` will reset the weight to the + system default. The system default may be set by the kernel + or drivers at boot or during hotplug events. diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index eca38fa81e0f98521a2963cec536de98bc2297ef..a70f20ce1ffb4ffd0cdcb8e575dce5547c76e73a 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -250,6 +250,15 @@ MPOL_PREFERRED_MANY can fall back to all existing numa nodes. This is effectively MPOL_PREFERRED allowed for a mask rather than a single node. +MPOL_WEIGHTED_INTERLEAVE + This mode operates the same as MPOL_INTERLEAVE, except that + interleaving behavior is executed based on the weights set in + /sys/kernel/mm/mempolicy/weighted_interleave/. + + Weighted interleave allocates pages on nodes according to their + weights. For example, if nodes [0,1] are weighted [5,2], 5 pages + will be allocated on node0 for every 2 pages allocated on node1. 
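A minimal user-space sketch (illustrative only, not part of this patch) of how a task might opt in to the new mode, assuming a kernel with this series applied, nodes 0 and 1 present, and weights [5,2] already written by root to the sysfs files described above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef MPOL_WEIGHTED_INTERLEAVE
#define MPOL_WEIGHTED_INTERLEAVE 6	/* uapi value added by this series */
#endif

int main(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 */
	long page = sysconf(_SC_PAGESIZE);
	char *buf;

	/* glibc has no set_mempolicy() wrapper; use the raw syscall */
	if (syscall(SYS_set_mempolicy, MPOL_WEIGHTED_INTERLEAVE,
		    &nodemask, 8 * sizeof(nodemask) + 1)) {
		perror("set_mempolicy");
		return 1;
	}

	/* Only allocations made after this point follow the policy: with
	 * weights [5,2], each run of 7 faulted pages places 5 on node0
	 * and 2 on node1. */
	buf = aligned_alloc(page, 7 * page);
	if (!buf)
		return 1;
	memset(buf, 0, 7 * page);	/* fault the pages in */
	free(buf);
	return 0;
}

The resulting placement can be checked per-node through the N<id>= counts in /proc/<pid>/numa_maps.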
+ NUMA memory policy supports the following optional mode flags: MPOL_F_STATIC_NODES diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 720000d83c53f9e37aeb54acad9c7f3aefb59f51..9107151b6cfa7c0d34751c3affe986a7240fb9a8 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -433,60 +433,11 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, return ret; } -#ifdef CONFIG_NUMA -static int kernfs_vma_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!kernfs_get_active(of->kn)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - kernfs_put_active(of->kn); - return ret; -} - -static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, - unsigned long addr, pgoff_t *ilx) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!kernfs_get_active(of->kn)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr, NULL); - - kernfs_put_active(of->kn); - return pol; -} - -#endif - static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, -#ifdef CONFIG_NUMA - .set_policy = kernfs_vma_set_policy, - .get_policy = kernfs_vma_get_policy, -#endif }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a691365061c1704ebdbbb2e5fbd08c99b5f1d36..9abba120c317e269ce6d38bc1d99e7c7503bdb2d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1959,8 +1959,9 @@ static int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mempolicy *pol; char buffer[64]; + struct mempolicy *pol; + pgoff_t ilx; int nid; if (!mm) @@ -1969,7 +1970,7 @@ static int show_numa_map(struct seq_file *m, void *v) /* Ensure we start with an empty set of numa_maps statistics. 
*/ memset(md, 0, sizeof(*md)); - pol = __get_vma_policy(vma, vma->vm_start); + pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fbad47b3f5e87f18895de12508ad00766ea00cdf..558c3f6bee08a3d97e0d2a822693333e9957d360 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -8,6 +8,7 @@ #include <linux/topology.h> struct vm_area_struct; +struct mempolicy; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -295,7 +296,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); -struct folio *folio_alloc(gfp_t gfp, unsigned order); +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); +struct folio *folio_alloc(gfp_t gfp, unsigned int order); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -303,6 +306,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } +static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return alloc_pages(gfp, order); +} static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 93da3db9d7ce9e4f60f17ad12a04f46d70f85571..d77fac986e0d519b0a6bd5ea1e2ab6936bbf8d1a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -796,8 +796,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, @@ -1137,13 +1135,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, - struct vm_area_struct *vma, - unsigned long address) -{ - return NULL; -} - static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 26b13910d60bf7a14623352a1bff9f710a2c9c17..5cb841943283048c5af82946ade8018b0591eced 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -17,6 +17,8 @@ struct mm_struct; +#define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ + #ifdef CONFIG_NUMA /* @@ -110,35 +112,30 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) /* * Tree of shared policies for a shared memory region. - * Maintain the policies in a pseudo mm that contains vmas. The vmas - * carry the policy. As a special twist the pseudo mm is indexed in pages, not - * bytes, so that we can work with shared memory segments bigger than - * unsigned long. 
*/ - -struct sp_node { - struct rb_node nd; - unsigned long start, end; - struct mempolicy *policy; -}; - struct shared_policy { struct rb_root root; rwlock_t lock; }; +struct sp_node { + struct rb_node nd; + pgoff_t start, end; + struct mempolicy *policy; +}; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, - struct mempolicy *new); -void mpol_free_shared_policy(struct shared_policy *p); +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *mpol); +void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, - unsigned long idx); + pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr); + unsigned long addr, pgoff_t *ilx); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); @@ -152,8 +149,6 @@ extern int huge_node(struct vm_area_struct *vma, extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); -extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -196,12 +191,17 @@ extern long __do_mbind(unsigned long start, unsigned long len, struct mempolicy {}; +static inline struct mempolicy *get_task_policy(struct task_struct *p) +{ + return NULL; +} + static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } -static inline void mpol_put(struct mempolicy *p) +static inline void mpol_put(struct mempolicy *pol) { } @@ -220,18 +220,25 @@ static inline void mpol_shared_policy_init(struct shared_policy *sp, { } -static inline void mpol_free_shared_policy(struct shared_policy *p) +static inline void mpol_free_shared_policy(struct shared_policy *sp) { } static inline struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } #define vma_policy(vma) NULL +static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} + static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8494de333376224ab6813cdbacef5126d4a714d2..1f9bb10d1a473f553f328d5b5a6747c687b7931f 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -49,7 +49,7 @@ enum { #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to policy */ #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ -#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ +#define MPOL_MF_LAZY (1<<3) /* UNSUPPORTED FLAG: Lazy migrate on fault */ #define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ #define MPOL_MF_VALID (MPOL_MF_STRICT | \ diff --git a/ipc/shm.c b/ipc/shm.c index fdc248663b2b0faba39563aa1b73e9b3b810520c..bb017dd760f4d997120f20993d94e94b2884a40f 100644 --- a/ipc/shm.c +++ 
b/ipc/shm.c @@ -565,30 +565,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma) } #ifdef CONFIG_NUMA -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); + struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) - err = sfd->vm_ops->set_policy(vma, new); + err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); - struct mempolicy *pol = NULL; + struct shm_file_data *sfd = shm_file_data(vma->vm_file); + struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) - pol = sfd->vm_ops->get_policy(vma, addr, NULL); - else if (vma->vm_policy) - pol = vma->vm_policy; - - return pol; + mpol = sfd->vm_ops->get_policy(vma, addr, ilx); + return mpol; } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1cd1196e0d66fba77811cf211c9e4f22c9d04b73..2838f14509c79f2bbe66612eef82b9f278e27c62 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2605,24 +2605,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } -/* mempolicy aware migration callback */ -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address) -{ - struct mempolicy *mpol; - nodemask_t *nodemask; - struct folio *folio; - gfp_t gfp_mask; - int node; - - gfp_mask = htlb_alloc_mask(h); - node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); - mpol_cond_put(mpol); - - return folio; -} - static nodemask_t *policy_mbind_nodemask(gfp_t gfp) { #ifdef CONFIG_NUMA @@ -6603,6 +6585,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } #ifdef CONFIG_USERFAULTFD +/* + * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). + */ +static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct folio *folio; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + mpol_cond_put(mpol); + + return folio; +} + /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8d7732e276f3fb48b6b651d7c665decfa947276e..10432b8cc835e05d4461de37c8be2a4ae0a54b82 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -19,13 +19,20 @@ * for anonymous memory. For process policy an process counter * is used. * + * weighted interleave + * Allocate memory interleaved over a set of nodes based on + * a set of weights (per-node), with normal fallback if it + * fails. Otherwise operates the same as interleave. + * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated + * on node 0 for every 1 page allocated on node 1. + * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. 
It would be better if bind would truly restrict * the allocation to memory nodes instead * - * preferred Try a specific node first before normal fallback. + * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default @@ -52,7 +59,7 @@ * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * - * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ @@ -113,7 +120,8 @@ /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ -#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; @@ -132,6 +140,32 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/* + * iw_table is the sysfs-set interleave weight table, a value of 0 denotes + * system-default value should be used. A NULL iw_table also denotes that + * system-default values should be used. Until the system-default table + * is implemented, the system-default is always 1. + * + * iw_table is RCU protected + */ +static u8 __rcu *iw_table; +static DEFINE_MUTEX(iw_table_lock); + +static u8 get_il_weight(int node) +{ + u8 *table; + u8 weight; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* if no iw_table, use system default */ + weight = table ? table[node] : 1; + /* if value in iw_table is 0, use system default */ + weight = weight ? weight : 1; + rcu_read_unlock(); + return weight; +} + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -269,9 +303,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, { struct mempolicy *policy; - pr_debug("setting mode %d flags %d nodes[0] %lx\n", - mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); - if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -299,6 +330,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); @@ -311,11 +343,11 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, } /* Slow path of a mpol destructor. */ -void __mpol_put(struct mempolicy *p) +void __mpol_put(struct mempolicy *pol) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!atomic_dec_and_test(&pol->refcnt)) return; - kmem_cache_free(policy_cache, p); + kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) @@ -372,7 +404,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) * * Called with task's alloc_lock held. */ - void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); @@ -383,7 +414,6 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) * * Call holding a reference to mm. 
Takes mm->mmap_lock during call. */ - void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; @@ -420,10 +450,31 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, + [MPOL_WEIGHTED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, }; -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid); + +static bool strictly_unmovable(unsigned long flags) +{ + /* + * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO + * if any misplaced page is found. + */ + return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == + MPOL_MF_STRICT; +} + +struct migration_mpol { /* for alloc_migration_target_by_mpol() */ + struct mempolicy *pol; + pgoff_t ilx; +}; struct queue_pages { struct list_head *pagelist; @@ -432,7 +483,8 @@ struct queue_pages { unsigned long start; unsigned long end; struct vm_area_struct *first; - bool has_unmovable; + struct folio *large; /* note last large folio encountered */ + long nr_failed; /* could not be isolated at this time */ }; /* @@ -450,61 +502,37 @@ static inline bool queue_folio_required(struct folio *folio, return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } -/* - * queue_folios_pmd() has three possible return values: - * 0 - folios are placed on the right node or queued successfully, or - * special page is met, i.e. zero page, or unmovable page is found - * but continue walking (indicated by queue_pages.has_unmovable). - * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing folio was already on a node that does not follow the - * policy. - */ -static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, - unsigned long end, struct mm_walk *walk) - __releases(ptl) +static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { - int ret = 0; struct folio *folio; struct queue_pages *qp = walk->private; - unsigned long flags; if (unlikely(is_pmd_migration_entry(*pmd))) { - ret = -EIO; - goto unlock; + qp->nr_failed++; + return; } folio = pmd_folio(*pmd); if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; - goto unlock; + return; } if (!queue_folio_required(folio, qp)) - goto unlock; - - flags = qp->flags; - /* go to folio migration */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - if (!vma_migratable(walk->vma) || - migrate_folio_add(folio, qp->pagelist, flags)) { - qp->has_unmovable = true; - goto unlock; - } - } else - ret = -EIO; -unlock: - spin_unlock(ptl); - return ret; + return; + if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(walk->vma) || + !migrate_folio_add(folio, qp->pagelist, qp->flags)) + qp->nr_failed++; } /* - * Scan through pages checking if pages follow certain conditions, - * and move them to the pagelist if they do. + * Scan through folios, checking if they satisfy the required conditions, + * moving them from LRU to local pagelist for migration if they do (or not). * - * queue_folios_pte_range() has three possible return values: - * 0 - folios are placed on the right node or queued successfully, or - * special page is met, i.e. zero page, or unmovable page is found - * but continue walking (indicated by queue_pages.has_unmovable). 
- * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already - * on a node that does not follow the policy. + * queue_folios_pte_range() has two possible return values: + * 0 - continue walking to scan for more, even if an existing folio on the + * wrong node could not be isolated and queued for migration. + * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, + * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -520,8 +548,11 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, int max_nr, nr; ptl = pmd_trans_huge_lock(pmd, vma); - if (ptl) - return queue_folios_pmd(pmd, ptl, addr, end, walk); + if (ptl) { + queue_folios_pmd(pmd, walk); + spin_unlock(ptl); + goto out; + } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { @@ -532,8 +563,13 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, max_nr = (end - addr) >> PAGE_SHIFT; nr = 1; ptent = ptep_get(pte); - if (!pte_present(ptent)) + if (pte_none(ptent)) continue; + if (!pte_present(ptent)) { + if (is_migration_entry(pte_to_swp_entry(ptent))) + qp->nr_failed++; + continue; + } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; @@ -549,94 +585,86 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, continue; if (!queue_folio_required(folio, qp)) continue; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + if (folio_test_large(folio)) { /* - * MPOL_MF_STRICT must be specified if we get here. - * Continue walking vmas due to MPOL_MF_MOVE* flags. - */ - if (!vma_migratable(vma)) - qp->has_unmovable = true; - - /* - * Do not abort immediately since there may be - * temporary off LRU pages in the range. Still - * need migrate other LRU pages. + * A large folio can only be isolated from LRU once, + * but may be mapped by many PTEs (and Copy-On-Write may + * intersperse PTEs of other, order 0, folios). This is + * a common case, so don't mistake it for failure (but + * there can be other cases of multi-mapped pages which + * this quick check does not help to filter out - and a + * search of the pagelist might grow to be prohibitive). + * + * migrate_pages(&pagelist) returns nr_failed folios, so + * check "large" now so that queue_pages_range() returns + * a comparable nr_failed folios. This does imply that + * if folio could not be isolated for some racy reason + * at its first PTE, later PTEs will not give it another + * chance of isolation; but keeps the accounting simple. */ - if (migrate_folio_add(folio, qp->pagelist, flags)) - qp->has_unmovable = true; - } else - break; + if (folio == qp->large) + continue; + qp->large = folio; + } + if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(vma) || + !migrate_folio_add(folio, qp->pagelist, flags)) { + qp->nr_failed++; + if (strictly_unmovable(flags)) + break; + } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); - - return addr != end ? 
-EIO : 0; +out: + if (qp->nr_failed && strictly_unmovable(flags)) + return -EIO; + return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - int ret = 0; #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; - unsigned long flags = (qp->flags & MPOL_MF_VALID); + unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); - if (!pte_present(entry)) + if (!pte_present(entry)) { + if (unlikely(is_hugetlb_entry_migration(entry))) + qp->nr_failed++; goto unlock; + } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; - - if (flags == MPOL_MF_STRICT) { - /* - * STRICT alone means only detecting misplaced folio and no - * need to further check other vma. - */ - ret = -EIO; + if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(walk->vma)) { + qp->nr_failed++; goto unlock; } - - if (!vma_migratable(walk->vma)) { - /* - * Must be STRICT with MOVE*, otherwise .test_walk() have - * stopped walking current vma. - * Detecting misplaced folio but allow migrating folios which - * have been queued. - */ - qp->has_unmovable = true; - goto unlock; - } - /* - * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it - * is shared it is likely not worth migrating. + * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. + * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_likely_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ - if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && !folio_likely_mapped_shared(folio) && - !hugetlb_pmd_shared(pte))) { - if (!isolate_hugetlb(folio, qp->pagelist) && - (flags & MPOL_MF_STRICT)) - /* - * Failed to isolate folio but allow migrating pages - * which have been queued. - */ - qp->has_unmovable = true; - } + if ((flags & MPOL_MF_MOVE_ALL) || + (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) + if (!isolate_hugetlb(folio, qp->pagelist)) + qp->nr_failed++; unlock: spin_unlock(ptl); -#else - BUG(); + if (qp->nr_failed && strictly_unmovable(flags)) + return -EIO; #endif - return ret; + return 0; } #ifdef CONFIG_NUMA_BALANCING @@ -665,12 +693,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, return nr_updated; } -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, @@ -678,7 +700,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; - unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ @@ -706,19 +727,11 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, !(flags & MPOL_MF_STRICT)) return 1; - if (endvma > end) - endvma = end; - - if (flags & MPOL_MF_LAZY) { - /* Similar to task_numa_work, skip inaccessible VMAs */ - if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && - !(vma->vm_flags & VM_MIXEDMAP)) - change_prot_numa(vma, start, endvma); - return 1; - } - - /* queue pages from current vma */ - if (flags & MPOL_MF_VALID) + /* + * Check page nodes, and queue pages to move, in the current vma. 
+ * But if no moving, and no strict checking, the scan can be skipped. + */ + if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } @@ -740,22 +753,21 @@ static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { /* * Walk through page tables and collect pages to be migrated. * - * If pages found in a given range are on a set of nodes (determined by - * @nodes and @flags,) it's isolated and queued to the pagelist which is - * passed via @private. + * If pages found in a given range are not on the required set of @nodes, + * and migration is allowed, they are isolated and queued to @pagelist. * - * queue_pages_range() has three possible return values: - * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were - * specified. - * 0 - queue pages successfully or no misplaced page. - * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or - * memory range specified by nodemask and maxnode points outside - * your accessible address space (-EFAULT) + * queue_pages_range() may return: + * 0 - all pages already on the right node, or successfully queued for moving + * (or neither strict checking nor moving requested: only range checking). + * >0 - this number of misplaced folios could not be queued for moving + * (a hugetlbfs page or a transparent huge page being counted as 1). + * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. + * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ -static int +static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist, bool lock_vma) + struct list_head *pagelist) { int err; struct queue_pages qp = { @@ -765,20 +777,17 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .start = start, .end = end, .first = NULL, - .has_unmovable = false, }; - const struct mm_walk_ops *ops = lock_vma ? + const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); - if (qp.has_unmovable) - err = 1; if (!qp.first) /* whole range in hole */ err = -EFAULT; - return err; + return err ? : qp.nr_failed; } /* @@ -786,7 +795,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, - struct mempolicy *pol) + struct mempolicy *pol) { int err; struct mempolicy *old; @@ -794,11 +803,6 @@ static int vma_replace_policy(struct vm_area_struct *vma, vma_assert_write_locked(vma); - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? 
vma->vm_ops->set_policy : NULL); - new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); @@ -837,7 +841,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, vmstart = vma->vm_start; } - if (mpol_equal(vma_policy(vma), new_pol)) { + if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } @@ -894,8 +898,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE) + if (new && (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; + current->il_weight = 0; + } task_unlock(current); mpol_put(old); ret = 0; @@ -909,18 +916,19 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, * * Called with task's alloc_lock held */ -static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) +static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); - if (p == &default_policy) + if (pol == &default_policy) return; - switch (p->mode) { + switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: - *nodes = p->nodes; + case MPOL_WEIGHTED_INTERLEAVE: + *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ @@ -967,6 +975,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } if (flags & MPOL_F_ADDR) { + pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We @@ -978,10 +987,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, mmap_read_unlock(mm); return -EFAULT; } - if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr, NULL); - else - pol = vma->vm_policy; + pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; @@ -1006,6 +1012,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); + } else if (pol == current->mempolicy && + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + if (current->il_weight) + *policy = current->il_prev; + else + *policy = next_node_in(current->il_prev, + pol->nodes); } else { err = -EINVAL; goto out; @@ -1041,12 +1054,12 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* - * We try to migrate only unshared folios. If it is shared it - * is likely not worth migrating. + * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. + * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_likely_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. @@ -1057,32 +1070,31 @@ static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); - } else if (flags & MPOL_MF_STRICT) { + } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be - * isolated, so they can't be moved at the moment. It - * should return -EIO for this case too. 
+ * isolated, so they can't be moved at the moment. */ - return -EIO; + return false; } } - - return 0; + return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ -static int migrate_to_node(struct mm_struct *mm, int source, int dest, - int flags) +static long migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); - int err = 0; + long nr_failed; + long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, @@ -1091,25 +1103,34 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodes_clear(nmask); node_set(source, nmask); - /* - * This does not "check" the range but isolates all pages that - * need migration. Between passing in the full user address - * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. - */ + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + + mmap_read_lock(mm); vma = find_vma(mm, 0); - if (unlikely(!vma)) - return 0; + if (unlikely(!vma)) { + mmap_read_unlock(mm); + return 0; + } - VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, - flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); + + /* + * This does not migrate the range, but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, + * but passes back the count of pages which could not be isolated. + */ + nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } + if (err >= 0) + err += nr_failed; return err; } @@ -1123,7 +1142,8 @@ static int migrate_area_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); - int err = 0; + long nr_failed; + long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, @@ -1132,23 +1152,30 @@ static int migrate_area_to_node(struct mm_struct *mm, int source, int dest, nodes_clear(nmask); node_set(source, nmask); + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + + mmap_read_lock(mm); + vma = find_vma(mm, 0); + /* - * This does not "check" the range but isolates all pages that + * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address - * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, + * but passes back the count of pages which could not be isolated. 
*/ - vma = find_vma(mm, 0); - VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, start, end, &nmask, - flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); + nr_failed = queue_pages_range(mm, start, end, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_DAMON_DEMOTION, NULL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_DAMON_DEMOTION, NULL); if (err) putback_movable_pages(&pagelist); } + if (err >= 0) + err += nr_failed; return err; } @@ -1162,14 +1189,12 @@ static int migrate_area_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { - int busy = 0; - int err = 0; + long nr_failed = 0; + long err = 0; nodemask_t tmp; lru_cache_disable(); - mmap_read_lock(mm); - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1245,17 +1270,15 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) - busy += err; + nr_failed += err; if (err < 0) break; } - mmap_read_unlock(mm); lru_cache_enable(); if (err < 0) return err; - return busy; - + return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* @@ -1268,14 +1291,12 @@ int do_migrate_area_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, unsigned long start, unsigned long end, int flags) { - int busy = 0; - int err = 0; + long nr_failed = 0; + long err = 0; nodemask_t tmp; lru_cache_disable(); - mmap_read_lock(mm); - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1351,59 +1372,58 @@ int do_migrate_area_pages(struct mm_struct *mm, const nodemask_t *from, node_clear(source, tmp); err = migrate_area_to_node(mm, source, dest, start, end, flags); if (err > 0) - busy += err; + nr_failed += err; if (err < 0) break; } - mmap_read_unlock(mm); lru_cache_enable(); if (err < 0) return err; - return busy; - + return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* - * Allocate a new page for page migration based on vma policy. - * Start by assuming the page is mapped by the same vma as contains @start. - * Search forward from there, if not. N.B., this assumes that the - * list of pages handed to migrate_pages()--which is how we get here-- - * is in virtual address order. 
*/ -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { - struct vm_area_struct *vma; - unsigned long address; - VMA_ITERATOR(vmi, current->mm, start); - gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; + struct migration_mpol *mmpol = (struct migration_mpol *)private; + struct mempolicy *pol = mmpol->pol; + pgoff_t ilx = mmpol->ilx; + struct page *page; + unsigned int order; + int nid = numa_node_id(); + gfp_t gfp; - for_each_vma(vmi, vma) { - address = page_address_in_vma(&src->page, vma); - if (address != -EFAULT) - break; - } + order = folio_order(src); + ilx += src->index >> order; if (folio_test_hugetlb(src)) { - return alloc_hugetlb_folio_vma(folio_hstate(src), - vma, address); + nodemask_t *nodemask; + struct hstate *h; + + h = folio_hstate(src); + gfp = htlb_alloc_mask(h); + nodemask = policy_nodemask(gfp, pol, ilx, &nid); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; + else + gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; - /* - * if !vma, vma_alloc_folio() will use task or system default policy - */ - return vma_alloc_folio(gfp, folio_order(src), vma, address, - folio_test_large(src)); + page = alloc_pages_mpol(gfp, order, pol, ilx, nid); + return page_rmappable_folio(page); } #else -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - return -EIO; + return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, @@ -1412,7 +1432,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { return NULL; } @@ -1424,10 +1445,11 @@ long __do_mbind(unsigned long start, unsigned long len, { struct vm_area_struct *vma, *prev; struct vma_iterator vmi; + struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; - int err; - int ret; + long err; + long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) @@ -1453,9 +1475,6 @@ long __do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); - if (flags & MPOL_MF_LAZY) - new->flags |= MPOL_F_MOF; - /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1463,14 +1482,8 @@ long __do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", - start, start + len, mode, mode_flags, - nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); - - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); - } { NODEMASK_SCRATCH(scratch); if (scratch) { @@ -1486,45 +1499,82 @@ long __do_mbind(unsigned long start, unsigned long len, goto mpol_out; /* - * Lock the VMAs before scanning for pages to migrate, to ensure we don't - * miss a concurrently inserted page. + * Lock the VMAs before scanning for pages to migrate, + * to ensure we don't miss a concurrently inserted page. 
*/ - ret = queue_pages_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist, true); - - if (ret < 0) { - err = ret; - goto up_out; - } + nr_failed = queue_pages_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); - vma_iter_init(&vmi, mm, start); - prev = vma_prev(&vmi); - for_each_vma_range(vmi, vma, end) { - err = mbind_range(&vmi, vma, &prev, start, end, new); - if (err) - break; + if (nr_failed < 0) { + err = nr_failed; + nr_failed = 0; + } else { + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } } - if (!err) { - int nr_failed = 0; - - if (!list_empty(&pagelist)) { - WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_folio, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); - if (nr_failed) - putback_movable_pages(&pagelist); + if (!err && !list_empty(&pagelist)) { + /* Convert MPOL_DEFAULT's NULL to task or default policy */ + if (!new) { + new = get_task_policy(current); + mpol_get(new); } + mmpol.pol = new; + mmpol.ilx = 0; - if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT)) - err = -EIO; - } else { -up_out: - if (!list_empty(&pagelist)) - putback_movable_pages(&pagelist); + /* + * In the interleaved case, attempt to allocate on exactly the + * targeted nodes, for the first VMA to be migrated; for later + * VMAs, the nodes will still be interleaved from the targeted + * nodemask, but one by one may be selected differently. + */ + if (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE) { + struct page *page; + unsigned int order; + unsigned long addr = -EFAULT; + + list_for_each_entry(page, &pagelist, lru) { + if (!PageKsm(page)) + break; + } + if (!list_entry_is_head(page, &pagelist, lru)) { + vma_iter_init(&vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + addr = page_address_in_vma(page, vma); + if (addr != -EFAULT) + break; + } + } + if (addr != -EFAULT) { + order = compound_order(page); + /* We already know the pol, but not the ilx */ + mpol_cond_put(get_vma_policy(vma, addr, order, + &mmpol.ilx)); + /* Set base from which to increment by index */ + mmpol.ilx -= page->index >> order; + } + } } mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { + nr_failed |= migrate_pages(&pagelist, + alloc_migration_target_by_mpol, NULL, + (unsigned long)&mmpol, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND, NULL); + } + + if (nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) @@ -1634,8 +1684,7 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)(*mode) >= MPOL_MAX || - (unsigned int)(*mode) == MPOL_WEIGHTED_INTERLEAVE) + if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; @@ -1853,7 +1902,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, out_put: put_task_struct(task); goto out; - } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, @@ -1863,7 +1911,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } - /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user 
*nmask, @@ -1930,34 +1977,20 @@ bool vma_migratable(struct vm_area_struct *vma) } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct mempolicy *pol = NULL; - - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr, NULL); - } else if (vma->vm_policy) { - pol = vma->vm_policy; - - /* - * shmem_alloc_page() passes MPOL_F_SHARED policy with - * a pseudo vma whose vma->vm_ops=NULL. Take a reference - * count on these policies which will be dropped by - * mpol_cond_put() later - */ - if (mpol_needs_cond_ref(pol)) - mpol_get(pol); - } - } - - return pol; + *ilx = 0; + return (vma->vm_ops && vma->vm_ops->get_policy) ? + vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* - * get_vma_policy(@vma, @addr) + * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup + * @order: 0, or appropriate huge_page_order for interleaving + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or + * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. @@ -1966,14 +1999,19 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) { - struct mempolicy *pol = __get_vma_policy(vma, addr); + struct mempolicy *pol; + pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + *ilx += vma->vm_pgoff >> order; + *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + } return pol; } @@ -1983,8 +2021,9 @@ bool vma_policy_mof(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; + pgoff_t ilx; /* ignored here */ - pol = vma->vm_ops->get_policy(vma, vma->vm_start, NULL); + pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); @@ -2019,75 +2058,43 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } -/* - * Return a nodemask representing a mempolicy for filtering nodes for - * page allocation - */ -nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) +static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { - int mode = policy->mode; - - /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) - return &policy->nodes; - - if (mode == MPOL_PREFERRED_MANY) - return &policy->nodes; - - return NULL; -} + unsigned int node; + unsigned int cpuset_mems_cookie; -/* - * Return the preferred node id for 'prefer' mempolicy, and return - * the given id for all other policies. - * - * policy_node() is always coupled with policy_nodemask(), which - * secures the nodemask limit for 'bind' and 'prefer-many' policy. 
- */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) -{ - if (policy->mode == MPOL_PREFERRED) { - nd = first_node(policy->nodes); - } else { - /* - * __GFP_THISNODE shouldn't even be used with the bind policy - * because we might easily break the expectation to stay on the - * requested node and not break the policy. - */ - WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); +retry: + /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ + cpuset_mems_cookie = read_mems_allowed_begin(); + node = current->il_prev; + if (!current->il_weight || !node_isset(node, policy->nodes)) { + node = next_node_in(node, policy->nodes); + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry; + if (node == MAX_NUMNODES) + return node; + current->il_prev = node; + current->il_weight = get_il_weight(node); } - - if ((policy->mode == MPOL_BIND || - policy->mode == MPOL_PREFERRED_MANY) && - policy->home_node != NUMA_NO_NODE) - return policy->home_node; - - /* - * In nofallback mode, the remote node is not in zonelists, - * set remote node as preferred_nid or it will be skipped. - * MPOL_PREFERRED_MANY is not supported, becase at least - * one remote node that will be skipped. - */ - if (policy->mode == MPOL_BIND) { - if (numa_remote_nofallback(first_node(policy->nodes))) - return first_node(policy->nodes); - } - - return nd; + current->il_weight--; + return node; } /* Do dynamic interleaving for a process */ -static unsigned interleave_nodes(struct mempolicy *policy) +static unsigned int interleave_nodes(struct mempolicy *policy) { - unsigned next; - struct task_struct *me = current; + unsigned int nid; + unsigned int cpuset_mems_cookie; + + /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nid = next_node_in(current->il_prev, policy->nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); - next = next_node_in(me->il_prev, policy->nodes); - if (next < MAX_NUMNODES) - me->il_prev = next; - return next; + if (nid < MAX_NUMNODES) + current->il_prev = nid; + return nid; } /* @@ -2113,6 +2120,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); + case MPOL_WEIGHTED_INTERLEAVE: + return weighted_interleave_nodes(policy); + case MPOL_BIND: case MPOL_PREFERRED_MANY: { @@ -2137,56 +2147,140 @@ unsigned int mempolicy_slab_node(void) } } +static unsigned int read_once_policy_nodemask(struct mempolicy *pol, + nodemask_t *mask) +{ + /* + * barrier stabilizes the nodemask locally so that it can be iterated + * over safely without concern for changes. Allocators validate node + * selection does not violate mems_allowed, so this is safe. + */ + barrier(); + memcpy(mask, &pol->nodes, sizeof(nodemask_t)); + barrier(); + return nodes_weight(*mask); +} + +static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) +{ + nodemask_t nodemask; + unsigned int target, nr_nodes; + u8 *table; + unsigned int weight_total = 0; + u8 weight; + int nid; + + nr_nodes = read_once_policy_nodemask(pol, &nodemask); + if (!nr_nodes) + return numa_node_id(); + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* calculate the total weight */ + for_each_node_mask(nid, nodemask) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? 
weight : 1; + weight_total += weight; + } + + /* Calculate the node offset based on totals */ + target = ilx % weight_total; + nid = first_node(nodemask); + while (target) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + if (target < weight) + break; + target -= weight; + nid = next_node_in(nid, nodemask); + } + rcu_read_unlock(); + return nid; +} + /* - * Do static interleaving for a VMA with known offset @n. Returns the n'th - * node in pol->nodes (starting from n=0), wrapping around if n exceeds the - * number of present nodes. + * Do static interleaving for interleave index @ilx. Returns the ilx'th + * node in pol->nodes (starting from ilx=0), wrapping around if ilx + * exceeds the number of present nodes. */ -static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) +static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { - nodemask_t nodemask = pol->nodes; + nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; - /* - * The barrier will stabilize the nodemask in a register or on - * the stack so that it will stop changing under the code. - * - * Between first_node() and next_node(), pol->nodes could be changed - * by other threads. So we put pol->nodes in a local stack. - */ - barrier(); - nnodes = nodes_weight(nodemask); + nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); - target = (unsigned int)n % nnodes; + target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } -/* Determine a node number for interleave */ -static inline unsigned interleave_nid(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long addr, int shift) +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation, together with preferred node id (or the input node id). + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid) { - if (vma) { - unsigned long off; + nodemask_t *nodemask = NULL; + switch (pol->mode) { + case MPOL_PREFERRED: + /* Override input node id */ + *nid = first_node(pol->nodes); + break; + case MPOL_PREFERRED_MANY: + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; + break; + case MPOL_BIND: + /* Restrict to nodemask (but not on lower zones) */ + if (apply_policy_zone(pol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&pol->nodes)) + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) { + *nid = pol->home_node; + /* + * In nofallback mode, the remote node is not in the zonelists, + * so set the remote node as preferred_nid or it will be skipped. + * MPOL_PREFERRED_MANY is not supported, because at least + * one remote node would be skipped. + */ + } else if (numa_remote_nofallback(first_node(pol->nodes))) { + *nid = first_node(pol->nodes); + } /* - * for small pages, there is no difference between - * shift and PAGE_SHIFT, so the bit-shift is safe. - * for huge pages, since vm_pgoff is in units of small - * pages, we need to shift off the always 0 bits to get - * a useful offset. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. 
*/ - BUG_ON(shift < PAGE_SHIFT); - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); - off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, off); - } else - return interleave_nodes(pol); + WARN_ON_ONCE(gfp & __GFP_THISNODE); + break; + case MPOL_INTERLEAVE: + *nid = NUMA_NO_NODE; + if (smart_grid_used()) + *nid = sched_grid_preferred_interleave_nid(pol); + /* Override input node id */ + if (*nid == NUMA_NO_NODE) + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + interleave_nodes(pol) : interleave_nid(pol, ilx); + break; + case MPOL_WEIGHTED_INTERLEAVE: + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + weighted_interleave_nodes(pol) : + weighted_interleave_nid(pol, ilx); + break; + } + + return nodemask; } #ifdef CONFIG_HUGETLBFS @@ -2202,27 +2296,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. - * - * Must be protected by read_mems_allowed_begin() */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, - struct mempolicy **mpol, nodemask_t **nodemask) + struct mempolicy **mpol, nodemask_t **nodemask) { + pgoff_t ilx; int nid; - int mode; - - *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; - mode = (*mpol)->mode; - if (unlikely(mode == MPOL_INTERLEAVE)) { - nid = interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))); - } else { - nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) - *nodemask = &(*mpol)->nodes; - } + nid = numa_node_id(); + *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); + *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } @@ -2256,6 +2339,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2300,27 +2384,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk, return ret; } -/* Allocate a page in interleaved policy. - Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - unsigned nid) -{ - struct page *page; - - page = __alloc_pages(gfp, order, nid, NULL); - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ - if (!static_branch_likely(&vm_numa_stat_key)) - return page; - if (page && page_to_nid(page) == nid) { - preempt_disable(); - __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); - preempt_enable(); - } - return page; -} - static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - int nid, struct mempolicy *pol) + int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; @@ -2333,7 +2398,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + page = __alloc_pages(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages(gfp, order, nid, NULL); @@ -2341,59 +2406,29 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, } /** - * vma_alloc_folio - Allocate a folio for a VMA. + * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. - * @order: Order of the folio. 
- * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: For hugepages try only the preferred node if possible. + * @order: Order of the page allocation. + * @pol: Pointer to the NUMA mempolicy. + * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). + * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * - * Allocate a folio for a specific address in @vma, using the appropriate - * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock - * of the mm_struct of the VMA to prevent it from going away. Should be - * used for all allocations for folios that will be mapped into user space. - * - * Return: The folio on success or NULL if allocation fails. + * Return: The page on success or NULL if allocation fails. */ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) { - struct mempolicy *pol; - int node = numa_node_id(); - struct folio *folio; - int preferred_nid; - nodemask_t *nmask; - - pol = get_vma_policy(vma, addr); - - if (pol->mode == MPOL_INTERLEAVE) { - struct page *page; - int nid = NUMA_NO_NODE; - - if (smart_grid_used()) - nid = sched_grid_preferred_interleave_nid(pol); - if (nid == NUMA_NO_NODE) - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - - mpol_cond_put(pol); - gfp |= __GFP_COMP; - page = alloc_page_interleave(gfp, order, nid); - return page_rmappable_folio(page); - } + nodemask_t *nodemask; + struct page *page; - if (pol->mode == MPOL_PREFERRED_MANY) { - struct page *page; + nodemask = policy_nodemask(gfp, pol, ilx, &nid); - node = policy_node(gfp, pol, node); - gfp |= __GFP_COMP; - page = alloc_pages_preferred_many(gfp, order, node, pol); - mpol_cond_put(pol); - return page_rmappable_folio(page); - } - - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_preferred_many(gfp, order, nid, nodemask); + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + /* filter "hugepage" allocation, unless from alloc_pages() */ + order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred @@ -2404,41 +2439,70 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ - if (pol->mode == MPOL_PREFERRED) - hpage_node = first_node(pol->nodes); - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); + if (pol->mode != MPOL_INTERLEAVE && + pol->mode != MPOL_WEIGHTED_INTERLEAVE && + (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - folio = __folio_alloc_node(gfp | __GFP_THISNODE | - __GFP_NORETRY, order, hpage_node); - + page = __alloc_pages_node(nid, + gfp | __GFP_THISNODE | __GFP_NORETRY, order); + if (page || !(gfp & __GFP_DIRECT_RECLAIM)) + return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. 
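 		 *
 		 * (For example, GFP_TRANSHUGE includes __GFP_DIRECT_RECLAIM, so
 		 * the remote retry is taken; GFP_TRANSHUGE_LIGHT does not, and
 		 * gives up after the local THISNODE attempt.)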
 		 */
-		if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
-			folio = __folio_alloc(gfp, order, hpage_node,
-					      nmask);
+		}
+	}
 
-		goto out;
+	page = __alloc_pages(gfp, order, nid, nodemask);
+
+	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
+		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
+		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
+		if (static_branch_likely(&vm_numa_stat_key) &&
+		    page_to_nid(page) == nid) {
+			preempt_disable();
+			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
+			preempt_enable();
+		}
 	}
 
-	nmask = policy_nodemask(gfp, pol);
-	preferred_nid = policy_node(gfp, pol, node);
-	if (smart_grid_used())
-		preferred_nid = sched_grid_preferred_nid(preferred_nid, nmask);
-	folio = __folio_alloc(gfp, order, preferred_nid, nmask);
+	return page;
+}
+
+/**
+ * vma_alloc_folio - Allocate a folio for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the folio.
+ * @vma: Pointer to VMA.
+ * @addr: Virtual address of the allocation. Must be inside @vma.
+ * @hugepage: Unused (was: For hugepages try only preferred node if possible).
+ *
+ * Allocate a folio for a specific address in @vma, using the appropriate
+ * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
+ * VMA to prevent it from going away. Should be used for all allocations
+ * for folios that will be mapped into user space, excepting hugetlbfs, and
+ * excepting where direct use of alloc_pages_mpol() is more appropriate.
+ *
+ * Return: The folio on success or NULL if allocation fails.
+ */
+struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+		unsigned long addr, bool hugepage)
+{
+	struct mempolicy *pol;
+	pgoff_t ilx;
+	struct page *page;
+
+	pol = get_vma_policy(vma, addr, order, &ilx);
+	page = alloc_pages_mpol(gfp | __GFP_COMP, order,
+				pol, ilx, numa_node_id());
 	mpol_cond_put(pol);
-out:
-	return folio;
+	return page_rmappable_folio(page);
 }
 EXPORT_SYMBOL(vma_alloc_folio);
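Callers with no VMA context follow the same pattern as alloc_pages() below; a minimal sketch (hypothetical caller, not part of this series) of the new calling convention:

	struct mempolicy *pol = get_task_policy(current);
	struct page *page;

	/* NO_INTERLEAVE_INDEX selects the dynamic, per-task interleave state */
	page = alloc_pages_mpol(GFP_KERNEL, 0, pol, NO_INTERLEAVE_INDEX,
				numa_node_id());

No mpol_cond_put() is needed in that case, since task policies are never MPOL_F_SHARED; the zswap hunk further down relies on the same fact.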
@@ -2456,33 +2520,23 @@ EXPORT_SYMBOL(vma_alloc_folio);
  * flags are used.
  * Return: The page on success or NULL if allocation fails.
  */
-struct page *alloc_pages(gfp_t gfp, unsigned order)
+struct page *alloc_pages(gfp_t gfp, unsigned int order)
 {
 	struct mempolicy *pol = &default_policy;
-	struct page *page;
-
-	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
-		pol = get_task_policy(current);
 
 	/*
 	 * No reference counting needed for current->mempolicy
 	 * nor system default_policy
 	 */
-	if (pol->mode == MPOL_INTERLEAVE)
-		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
-	else if (pol->mode == MPOL_PREFERRED_MANY)
-		page = alloc_pages_preferred_many(gfp, order,
-				policy_node(gfp, pol, numa_node_id()), pol);
-	else
-		page = __alloc_pages(gfp, order,
-				policy_node(gfp, pol, numa_node_id()),
-				policy_nodemask(gfp, pol));
+	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+		pol = get_task_policy(current);
 
-	return page;
+	return alloc_pages_mpol(gfp, order,
+				pol, NO_INTERLEAVE_INDEX, numa_node_id());
 }
 EXPORT_SYMBOL(alloc_pages);
 
-struct folio *folio_alloc(gfp_t gfp, unsigned order)
+struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 {
 	return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
 }
@@ -2523,6 +2577,121 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
 	return total_allocated;
 }
 
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+		struct mempolicy *pol, unsigned long nr_pages,
+		struct page **page_array)
+{
+	struct task_struct *me = current;
+	unsigned int cpuset_mems_cookie;
+	unsigned long total_allocated = 0;
+	unsigned long nr_allocated = 0;
+	unsigned long rounds;
+	unsigned long node_pages, delta;
+	u8 *table, *weights, weight;
+	unsigned int weight_total = 0;
+	unsigned long rem_pages = nr_pages;
+	nodemask_t nodes;
+	int nnodes, node;
+	int resume_node = MAX_NUMNODES - 1;
+	u8 resume_weight = 0;
+	int prev_node;
+	int i;
+
+	if (!nr_pages)
+		return 0;
+
+	/* read the nodes onto the stack, retry if done during rebind */
+	do {
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		nnodes = read_once_policy_nodemask(pol, &nodes);
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+	/* if the nodemask has become invalid, we cannot do anything */
+	if (!nnodes)
+		return 0;
+
+	/* Continue allocating from most recent node and adjust the nr_pages */
+	node = me->il_prev;
+	weight = me->il_weight;
+	if (weight && node_isset(node, nodes)) {
+		node_pages = min(rem_pages, weight);
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array += nr_allocated;
+		total_allocated += nr_allocated;
+		/* if that's all the pages, no need to interleave */
+		if (rem_pages <= weight) {
+			me->il_weight -= rem_pages;
+			return total_allocated;
+		}
+		/* Otherwise we adjust remaining pages, continue from there */
+		rem_pages -= weight;
+	}
+	/* clear active weight in case of an allocation failure */
+	me->il_weight = 0;
+	prev_node = node;
+
+	/* create a local copy of node weights to operate on outside rcu */
+	weights = kzalloc(nr_node_ids, GFP_KERNEL);
+	if (!weights)
+		return total_allocated;
+
+	rcu_read_lock();
+	table = rcu_dereference(iw_table);
+	if (table)
+		memcpy(weights, table, nr_node_ids);
+	rcu_read_unlock();
+
+	/* calculate total, detect system default usage */
+	for_each_node_mask(node, nodes) {
+		if (!weights[node])
+			weights[node] = 1;
+		weight_total += weights[node];
+	}
+
+	/*
+	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
+	 * Track which node weighted interleave should resume from.
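+	 *
+	 * For example, with hypothetical weights: nodes [0,1] weighted [3,1]
+	 * give weight_total = 4; rem_pages = 9 yields rounds = 2, delta = 1.
+	 * Node 0 then receives 3 * 2 + 1 = 7 pages, node 1 receives 1 * 2 = 2
+	 * pages, and the task resumes from node 0 with il_weight = 3 - 1 = 2.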
+ * + * if (rounds > 0) and (delta == 0), resume_node will always be + * the node following prev_node and its weight. + */ + rounds = rem_pages / weight_total; + delta = rem_pages % weight_total; + resume_node = next_node_in(prev_node, nodes); + resume_weight = weights[resume_node]; + for (i = 0; i < nnodes; i++) { + node = next_node_in(prev_node, nodes); + weight = weights[node]; + node_pages = weight * rounds; + /* If a delta exists, add this node's portion of the delta */ + if (delta > weight) { + node_pages += weight; + delta -= weight; + } else if (delta) { + /* when delta is depleted, resume from that node */ + node_pages += delta; + resume_node = node; + resume_weight = weight - delta; + delta = 0; + } + /* node_pages can be 0 if an allocation fails and rounds == 0 */ + if (!node_pages) + break; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + if (total_allocated == nr_pages) + break; + prev_node = node; + } + me->il_prev = resume_node; + me->il_weight = resume_weight; + kfree(weights); + return total_allocated; +} + static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) @@ -2553,6 +2722,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; + nodemask_t *nodemask; + int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); @@ -2561,18 +2732,23 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return alloc_pages_bulk_array_weighted_interleave( + gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); - return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol), nr_pages, NULL, - page_array); + nid = numa_node_id(); + nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); + return __alloc_pages_bulk(gfp, nid, nodemask, + nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { - struct mempolicy *pol = mpol_dup(vma_policy(src)); + struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); @@ -2635,6 +2811,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2657,8 +2834,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ -static struct sp_node * -sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +static struct sp_node *sp_lookup(struct shared_policy *sp, + pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; @@ -2709,13 +2886,11 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, - new->policy ? 
new->policy->mode : 0); } /* Find shared policy intersecting idx */ -struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, + pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; @@ -2756,23 +2931,25 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; + pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); - unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; - pol = get_vma_policy(vma, addr); + pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, pgoff); + polnid = interleave_nid(pol, ilx); + break; + + case MPOL_WEIGHTED_INTERLEAVE: + polnid = weighted_interleave_nid(pol, ilx); break; case MPOL_PREFERRED: @@ -2849,7 +3026,6 @@ void mpol_put_task_policy(struct task_struct *task) static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); sp_free(n); } @@ -2884,8 +3060,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, } /* Replace a policy range. */ -static int shared_policy_replace(struct shared_policy *sp, unsigned long start, - unsigned long end, struct sp_node *new) +static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, + pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; @@ -2968,30 +3144,30 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) rwlock_init(&sp->lock); if (mpol) { - struct vm_area_struct pvma; - struct mempolicy *new; + struct sp_node *sn; + struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; - /* contextualize the tmpfs mount point mempolicy */ - new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) + + /* contextualize the tmpfs mount point mempolicy to this file */ + npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) - goto put_new; - - /* Create pseudo-vma that contains just the policy */ - vma_init(&pvma, NULL); - pvma.vm_end = TASK_SIZE; /* policy covers entire file */ - mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ - -put_new: - mpol_put(new); /* drop initial ref */ + goto put_npol; + + /* alloc node covering entire file; adds ref to file's npol */ + sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); + if (sn) + sp_insert(sp, sn); +put_npol: + mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: @@ -2999,46 +3175,40 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) } } -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, struct mempolicy *npol) +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - 
pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", - vma->vm_pgoff, - sz, npol ? npol->mode : -1, - npol ? npol->flags : -1, - npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE); - - if (npol) { - new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (pol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } - err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ -void mpol_free_shared_policy(struct shared_policy *p) +void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; - if (!p->root.rb_node) + if (!sp->root.rb_node) return; - write_lock(&p->lock); - next = rb_first(&p->root); + write_lock(&sp->lock); + next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - sp_delete(p, n); + sp_delete(sp, n); } - write_unlock(&p->lock); + write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING @@ -3088,7 +3258,6 @@ static inline void __init check_numabalancing_enable(void) } #endif /* CONFIG_NUMA_BALANCING */ -/* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; @@ -3151,7 +3320,6 @@ void numa_default_policy(void) /* * Parse and format mempolicy from/to strings */ - static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -3163,7 +3331,6 @@ static const char * const policy_modes[] = [MPOL_PREFERRED_MANY] = "prefer (many)", }; - #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. @@ -3219,6 +3386,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } break; case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ @@ -3333,6 +3501,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: @@ -3365,3 +3534,197 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +#ifdef CONFIG_SYSFS +struct iw_node_attr { + struct kobj_attribute kobj_attr; + int nid; +}; + +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct iw_node_attr *node_attr; + u8 weight; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + weight = get_il_weight(node_attr->nid); + return sysfs_emit(buf, "%d\n", weight); +} + +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct iw_node_attr *node_attr; + u8 *new; + u8 *old; + u8 weight = 0; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + if (count == 0 || sysfs_streq(buf, "")) + weight = 0; + else if (kstrtou8(buf, 0, &weight)) + return -EINVAL; + + new = kzalloc(nr_node_ids, GFP_KERNEL); + if (!new) + return -ENOMEM; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + if (old) + memcpy(new, old, nr_node_ids); + new[node_attr->nid] = weight; + rcu_assign_pointer(iw_table, new); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + return count; +} + +static struct iw_node_attr **node_attrs; + +static void sysfs_wi_node_delete(struct 
+static void sysfs_wi_node_delete(struct iw_node_attr *node_attr,
+				 struct kobject *parent)
+{
+	if (!node_attr)
+		return;
+	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+	kfree(node_attr->kobj_attr.attr.name);
+	kfree(node_attr);
+}
+
+static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
+{
+	int nid;
+
+	for (nid = 0; nid < nr_node_ids; nid++)
+		sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
+}
+
+static void iw_table_free(void)
+{
+	u8 *old;
+
+	mutex_lock(&iw_table_lock);
+	old = rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	rcu_assign_pointer(iw_table, NULL);
+	mutex_unlock(&iw_table_lock);
+
+	synchronize_rcu();
+	kfree(old);
+}
+
+static void wi_cleanup(struct kobject *wi_kobj)
+{
+	sysfs_wi_node_delete_all(wi_kobj);
+	iw_table_free();
+	kfree(node_attrs);
+}
+
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+	kfree(wi_kobj);
+}
+
+static const struct kobj_type wi_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = wi_kobj_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+	struct iw_node_attr *node_attr;
+	char *name;
+
+	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
+	if (!node_attr)
+		return -ENOMEM;
+
+	name = kasprintf(GFP_KERNEL, "node%d", nid);
+	if (!name) {
+		kfree(node_attr);
+		return -ENOMEM;
+	}
+
+	sysfs_attr_init(&node_attr->kobj_attr.attr);
+	node_attr->kobj_attr.attr.name = name;
+	node_attr->kobj_attr.attr.mode = 0644;
+	node_attr->kobj_attr.show = node_show;
+	node_attr->kobj_attr.store = node_store;
+	node_attr->nid = nid;
+
+	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+		kfree(node_attr->kobj_attr.attr.name);
+		kfree(node_attr);
+		pr_err("failed to add attribute to weighted_interleave\n");
+		return -ENOMEM;
+	}
+
+	node_attrs[nid] = node_attr;
+	return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+	struct kobject *wi_kobj;
+	int nid, err;
+
+	node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
+			     GFP_KERNEL);
+	if (!node_attrs)
+		return -ENOMEM;
+
+	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+	if (!wi_kobj) {
+		kfree(node_attrs);
+		return -ENOMEM;
+	}
+
+	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+				   "weighted_interleave");
+	if (err)
+		goto err_put_kobj;
+
+	for_each_node_state(nid, N_POSSIBLE) {
+		err = add_weight_node(nid, wi_kobj);
+		if (err) {
+			pr_err("failed to add sysfs [node%d]\n", nid);
+			goto err_cleanup_kobj;
+		}
+	}
+
+	return 0;
+
+err_cleanup_kobj:
+	wi_cleanup(wi_kobj);
+	kobject_del(wi_kobj);
+err_put_kobj:
+	kobject_put(wi_kobj);
+	return err;
+}
+
+static int __init mempolicy_sysfs_init(void)
+{
+	int err;
+	static struct kobject *mempolicy_kobj;
+
+	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+	if (!mempolicy_kobj)
+		return -ENOMEM;
+
+	err = add_weighted_interleave_group(mempolicy_kobj);
+	if (err)
+		goto err_kobj;
+
+	return 0;
+
+err_kobj:
+	kobject_del(mempolicy_kobj);
+	kobject_put(mempolicy_kobj);
+	return err;
+}
+
+late_initcall(mempolicy_sysfs_init);
+#endif /* CONFIG_SYSFS */
diff --git a/mm/shmem.c b/mm/shmem.c
index 1cb7ec88d991b6e277566c0ce1e3220651361c27..3047c7f4124b08b2ea31d15894008b1d39128c14 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1684,38 +1684,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 	return NULL;
 }
 #endif /* CONFIG_NUMA && CONFIG_TMPFS */
-#ifndef CONFIG_NUMA
-#define vm_policy vm_private_data
-#endif
 
-static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
-		struct shmem_inode_info *info, pgoff_t index)
-{
-
/* Create a pseudo vma that just contains the policy */
-	vma_init(vma, NULL);
-	/* Bias interleave by inode number to distribute better across nodes */
-	vma->vm_pgoff = index + info->vfs_inode.i_ino;
-	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-}
-
-static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
-{
-	/* Drop reference taken by mpol_shared_policy_lookup() */
-	mpol_cond_put(vma->vm_policy);
-}
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+			pgoff_t index, unsigned int order, pgoff_t *ilx);
 
-static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
 		struct shmem_inode_info *info, pgoff_t index)
 {
-	struct vm_area_struct pvma;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
 	struct folio *folio;
-	struct vm_fault vmf = {
-		.vma = &pvma,
-	};
 
-	shmem_pseudo_vma_init(&pvma, info, index);
-	folio = swap_cluster_readahead(swap, gfp, &vmf);
-	shmem_pseudo_vma_destroy(&pvma);
+	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
+	mpol_cond_put(mpol);
 
 	return folio;
 }
@@ -1837,14 +1819,15 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault
 static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
 		struct shmem_inode_info *info, pgoff_t index)
 {
-	struct vm_area_struct pvma;
-	struct folio *folio;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
 
-	shmem_pseudo_vma_init(&pvma, info, index);
-	folio = vma_alloc_folio(gfp, order, &pvma, 0, order == PMD_ORDER);
-	shmem_pseudo_vma_destroy(&pvma);
+	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
+	page = alloc_pages_mpol(gfp | __GFP_COMP, order, mpol, ilx, numa_node_id());
+	mpol_cond_put(mpol);
 
-	return folio;
+	return page_rmappable_folio(page);
 }
 
 static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
@@ -2310,7 +2293,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		skip_swapcache = true;
 	} else {
 		/* Cached swapin only supports order 0 folio */
-		folio = shmem_swapin(swap, gfp, info, index);
+		folio = shmem_swapin_cluster(swap, gfp, info, index);
 		if (!folio) {
 			error = -ENOMEM;
 			goto failed;
@@ -2848,10 +2831,36 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 	struct inode *inode = file_inode(vma->vm_file);
 	pgoff_t index;
 
+	/*
+	 * Bias interleave by inode number to distribute better across nodes;
+	 * but this interface is independent of which page order is used, so
+	 * supplies only that bias, letting caller apply the offset (adjusted
+	 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
+	 */
+	*ilx = inode->i_ino;
 	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
 }
-#endif
+
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+			pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+	struct mempolicy *mpol;
+
+	/* Bias interleave by inode number to distribute better across nodes */
+	*ilx = info->vfs_inode.i_ino + (index >> order);
+
+	mpol = mpol_shared_policy_lookup(&info->policy, index);
+	return mpol ?
mpol : get_task_policy(current); +} +#else +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} +#endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { diff --git a/mm/swap.h b/mm/swap.h index 26fa536a89477eb8de8d0e8872146cf8932549cf..368e4f1e1ea2ef5670d24a495ceac19be6f767de 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,8 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +struct mempolicy; + #ifdef CONFIG_SWAP #include /* for bio_end_io_t */ @@ -47,11 +49,10 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated); struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); + struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); diff --git a/mm/swap_state.c b/mm/swap_state.c index 52b3e5c4d1ade74050d06449bfd4952efc03fa8e..840e0f93bf7d0f57f16c7e558533b18365c2df5f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -426,8 +427,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, } struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated) + struct mempolicy *mpol, pgoff_t ilx, + bool *new_page_allocated) { struct swap_info_struct *si; struct folio *folio; @@ -466,7 +467,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, + mpol, ilx, numa_node_id()); if (!folio) goto fail_put_swap; @@ -540,13 +542,18 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { - bool page_was_allocated; - struct folio *folio = __read_swap_cache_async(entry, gfp_mask, - vma, addr, &page_was_allocated); + bool page_allocated; + struct mempolicy *mpol; + pgoff_t ilx; + struct folio *folio; - if (page_was_allocated) - swap_read_folio(folio, plug); + mpol = get_vma_policy(vma, addr, 0, &ilx); + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + mpol_cond_put(mpol); + if (page_allocated) + swap_read_folio(folio, plug); return folio; } @@ -615,7 +622,8 @@ static unsigned long swapin_nr_pages(unsigned long offset) * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct folio for entry and addr, after queueing swapin. * @@ -624,13 +632,12 @@ static unsigned long swapin_nr_pages(unsigned long offset) * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... 
* - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold read mmap_lock if vmf->vma is not NULL. + * Note: it is intentional that the same NUMA policy and interleave index + * are used for every page of the readahead: neighbouring pages on swap + * are fairly likely to have been swapped out from the same node. */ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_fault *vmf) + struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; unsigned long entry_offset = swp_offset(entry); @@ -641,8 +648,6 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; - struct vm_area_struct *vma = vmf->vma; - unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -660,8 +665,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ folio = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, &page_allocated); + swp_entry(swp_type(entry), offset), + gfp_mask, mpol, ilx, &page_allocated); if (!folio) continue; if (page_allocated) { @@ -675,11 +680,14 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, } blk_finish_plug(&plug); swap_read_unplug(splug); - lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_read_folio(folio, NULL); + return folio; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -777,8 +785,10 @@ static void swap_ra_info(struct vm_fault *vmf, /** * swap_vma_readahead - swap in pages in hope we need them soon - * @fentry: swap entry of this memory + * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct folio for entry and addr, after queueing swapin. @@ -789,16 +799,17 @@ static void swap_ra_info(struct vm_fault *vmf, * Caller must hold read mmap_lock if vmf->vma is not NULL. 
* */ -static struct folio *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, +static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct vm_area_struct *vma = vmf->vma; struct folio *folio; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; + pgoff_t ilx; unsigned int i; bool page_allocated; struct vma_swap_readahead ra_info = { @@ -810,9 +821,10 @@ static struct folio *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, goto skip; addr = vmf->address - (ra_info.offset * PAGE_SIZE); + ilx = targ_ilx - ra_info.offset; blk_start_plug(&plug); - for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { + for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -826,8 +838,8 @@ static struct folio *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - folio = __read_swap_cache_async(entry, gfp_mask, vma, - addr, &page_allocated); + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); if (!folio) continue; if (page_allocated) { @@ -845,9 +857,12 @@ static struct folio *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, swap_read_unplug(splug); lru_add_drain(); skip: - /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - NULL); + /* The folio was likely read above, so no need for plugging here */ + folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_read_folio(folio, NULL); + return folio; } /** @@ -865,11 +880,15 @@ static struct folio *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { + struct mempolicy *mpol; + pgoff_t ilx; struct folio *folio; + mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); folio = swap_use_vma_readahead() ? - swap_vma_readahead(entry, gfp_mask, vmf) : - swap_cluster_readahead(entry, gfp_mask, vmf); + swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : + swap_cluster_readahead(entry, gfp_mask, mpol, ilx); + mpol_cond_put(mpol); if (!folio) return NULL; diff --git a/mm/zswap.c b/mm/zswap.c index dcf44712f0bf5f616684453829b9f69f76dbd5a0..8e28011bfbd80548cb41599ffc84b73525e9b245 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1063,6 +1064,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, { swp_entry_t swpentry = entry->swpentry; struct folio *folio; + struct mempolicy *mpol; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct zpool *pool = zswap_find_zpool(entry); @@ -1081,8 +1083,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* try to allocate swap cache folio */ - folio = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, - &folio_was_allocated); + mpol = get_task_policy(current); + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated); if (!folio) { ret = -ENOMEM; goto fail;
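To round the series off, a sketch of the userspace side (hypothetical values, assuming MPOL_WEIGHTED_INTERLEAVE is picked up from this series' uapi header, and weights as configured through the sysfs interface above):

	#include <numaif.h>

	unsigned long nodemask = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 */

	/* subsequent allocations interleave 3:1 across nodes 0 and 1 */
	if (set_mempolicy(MPOL_WEIGHTED_INTERLEAVE, &nodemask,
			  8 * sizeof(nodemask)))
		perror("set_mempolicy");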