From d634f94d18f323f3e037d1094b70ba9db3196140 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 27 Apr 2023 19:13:41 +0800 Subject: [PATCH 01/13] mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy mainline inclusion from mainline-v5.14-rc1 commit 7858d7bca7fbbbbd5b940d2ec371b2d060b21b84 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7858d7bca7fbbbbd5b940d2ec371b2d060b21b84 -------------------------------- MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Ben Widawsky Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ma Wupeng --- include/uapi/linux/mempolicy.h | 1 - mm/mempolicy.c | 136 ++++++++++++++------------------- 2 files changed, 56 insertions(+), 81 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 3354774af61e..5c90176df4ab 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -58,7 +58,6 @@ enum { * are never OR'ed into the mode in mempolicy API arguments. */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ -#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e2927e81c738..26497b2fcd96 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -121,8 +121,7 @@ enum zone_type policy_zone = 0; */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ - .mode = MPOL_PREFERRED, - .flags = MPOL_F_LOCAL, + .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; @@ -236,12 +235,9 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { - if (!nodes) - pol->flags |= MPOL_F_LOCAL; /* local allocation */ - else if (nodes_empty(*nodes)) - return -EINVAL; /* no allowed nodes */ - else - pol->v.preferred_node = first_node(*nodes); + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.preferred_node = first_node(*nodes); return 0; } @@ -256,8 +252,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes - * parameter with respect to the policy mode and flags. But, we need to - * handle an empty nodemask with MPOL_PREFERRED here. + * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. @@ -267,9 +262,14 @@ static int mpol_set_nodemask(struct mempolicy *pol, { int ret; - /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ - if (pol == NULL) + /* + * Default (pol==NULL) resp. local memory policies are not a + * subject of any remapping. They also do not need any special + * constructor. + */ + if (!pol || pol->mode == MPOL_LOCAL) return 0; + /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); @@ -278,25 +278,18 @@ static int mpol_set_nodemask(struct mempolicy *pol, nodes_or(nsc->mask1, cdmmask, nsc->mask1); #endif VM_BUG_ON(!nodes); - if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) - nodes = NULL; /* explicit local allocation */ - else { - if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); - else - nodes_and(nsc->mask2, *nodes, nsc->mask1); - if (mpol_store_user_nodemask(pol)) - pol->w.user_nodemask = *nodes; - else - pol->w.cpuset_mems_allowed = - cpuset_current_mems_allowed; - } + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); + else + nodes_and(nsc->mask2, *nodes, nsc->mask1); - if (nodes) - ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; else - ret = mpol_ops[pol->mode].create(pol, NULL); + pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; + + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } @@ -329,13 +322,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); + + mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); - mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -383,25 +377,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { - nodemask_t tmp; - - if (pol->flags & MPOL_F_STATIC_NODES) { - int node = first_node(pol->w.user_nodemask); - - if (node_isset(node, *nodes)) { - pol->v.preferred_node = node; - pol->flags &= ~MPOL_F_LOCAL; - } else - pol->flags |= MPOL_F_LOCAL; - } else if (pol->flags & MPOL_F_RELATIVE_NODES) { - mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); - pol->v.preferred_node = first_node(tmp); - } else if (!(pol->flags & MPOL_F_LOCAL)) { - pol->v.preferred_node = node_remap(pol->v.preferred_node, - pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; - } + pol->w.cpuset_mems_allowed = *nodes; } /* @@ -415,7 +391,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; - if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -466,6 +442,9 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_bind, .rebind = mpol_rebind_nodemask, }, + [MPOL_LOCAL] = { + .rebind = mpol_rebind_default, + }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, @@ -946,10 +925,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) case MPOL_INTERLEAVE: *nodes = p->v.nodes; break; + case MPOL_LOCAL: + /* return empty node mask for local allocation */ + break; + case MPOL_PREFERRED: - if (!(p->flags & MPOL_F_LOCAL)) - node_set(p->v.preferred_node, *nodes); - /* else return empty node mask for local allocation */ + node_set(p->v.preferred_node, *nodes); break; default: BUG(); @@ -1928,9 +1909,9 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) /* Return the node id preferred by the given mempolicy, or the given id */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { - if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) + if (policy->mode == MPOL_PREFERRED) { nd = policy->v.preferred_node; - else { + } else { /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the @@ -1974,14 +1955,11 @@ unsigned int mempolicy_slab_node(void) return node; policy = current->mempolicy; - if (!policy || policy->flags & MPOL_F_LOCAL) + if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: - /* - * handled MPOL_F_LOCAL above - */ return policy->v.preferred_node; case MPOL_INTERLEAVE: @@ -2001,6 +1979,8 @@ unsigned int mempolicy_slab_node(void) &policy->v.nodes); return z->zone ? zone_to_nid(z->zone) : node; } + case MPOL_LOCAL: + return node; default: BUG(); @@ -2122,16 +2102,18 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: - if (mempolicy->flags & MPOL_F_LOCAL) - nid = numa_node_id(); - else - nid = mempolicy->v.preferred_node; + nid = mempolicy->v.preferred_node; init_nodemask_of_node(mask, nid); break; case MPOL_BIND: case MPOL_INTERLEAVE: - *mask = mempolicy->v.nodes; + *mask = mempolicy->v.nodes; + break; + + case MPOL_LOCAL: + nid = numa_node_id(); + init_nodemask_of_node(mask, nid); break; default: @@ -2254,7 +2236,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave, or does not allow the current * node in its nodemask, we allocate the standard way. */ - if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) + if (pol->mode == MPOL_PREFERRED) hpage_node = pol->v.preferred_node; nmask = policy_nodemask(gfp, pol); @@ -2390,10 +2372,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - /* a's ->flags is the same as b's */ - if (a->flags & MPOL_F_LOCAL) - return true; return a->v.preferred_node == b->v.preferred_node; + case MPOL_LOCAL: + return true; default: BUG(); return false; @@ -2531,10 +2512,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_PREFERRED: - if (pol->flags & MPOL_F_LOCAL) - polnid = numa_node_id(); - else - polnid = pol->v.preferred_node; + polnid = pol->v.preferred_node; + break; + + case MPOL_LOCAL: + polnid = numa_node_id(); break; case MPOL_BIND: @@ -2900,9 +2882,6 @@ void numa_default_policy(void) * Parse and format mempolicy from/to strings */ -/* - * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. - */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -2980,7 +2959,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) */ if (nodelist) goto out; - mode = MPOL_PREFERRED; break; case MPOL_DEFAULT: /* @@ -3024,7 +3002,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) else if (nodelist) new->v.preferred_node = first_node(nodes); else - new->flags |= MPOL_F_LOCAL; + new->mode = MPOL_LOCAL; /* * Save nodes for contextualization: this will be used to "clone" @@ -3070,12 +3048,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) switch (mode) { case MPOL_DEFAULT: + case MPOL_LOCAL: break; case MPOL_PREFERRED: - if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; - else - node_set(pol->v.preferred_node, nodes); + node_set(pol->v.preferred_node, nodes); break; case MPOL_BIND: case MPOL_INTERLEAVE: -- Gitee From 6b918c6f4d20f5e6f0e82983bf52708ee24c67fa Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 27 Apr 2023 19:13:42 +0800 Subject: [PATCH 02/13] mm/mempolicy: unify the parameter sanity check for mbind and set_mempolicy mainline inclusion from mainline-v5.14-rc1 commit 95837924587c60425f941dc8cbfba61cb964fcb5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=95837924587c60425f941dc8cbfba61cb964fcb5 -------------------------------- Currently the kernel_mbind() and kernel_set_mempolicy() do almost the same operation for parameter sanity check. Add a helper function to unify the code to reduce the redundancy, and make it easier for changing the sanity check code in future. [thanks to David Rientjes for suggesting using helper function instead of macro]. [feng.tang@intel.com: add comment] Link: https://lkml.kernel.org/r/1622560492-1294-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-4-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: David Rientjes Cc: Dave Hansen Cc: Ben Widawsky Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ma Wupeng --- mm/mempolicy.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 26497b2fcd96..bd74753c11ed 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1474,26 +1474,38 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } +/* Basic parameter sanity check used by both mbind() and set_mempolicy() */ +static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) +{ + *flags = *mode & MPOL_MODE_FLAGS; + *mode &= ~MPOL_MODE_FLAGS; + if ((unsigned int)(*mode) >= MPOL_MAX) + return -EINVAL; + if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; + + return 0; +} + static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { + unsigned short mode_flags; nodemask_t nodes; + int lmode = mode; int err; - unsigned short mode_flags; start = untagged_addr(start); - mode_flags = mode & MPOL_MODE_FLAGS; - mode &= ~MPOL_MODE_FLAGS; - if (mode >= MPOL_MAX) - return -EINVAL; - if ((mode_flags & MPOL_F_STATIC_NODES) && - (mode_flags & MPOL_F_RELATIVE_NODES)) - return -EINVAL; + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; + err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_mbind(start, len, mode, mode_flags, &nodes, flags); + + return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, @@ -1507,20 +1519,20 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { - int err; + unsigned short mode_flags; nodemask_t nodes; - unsigned short flags; + int lmode = mode; + int err; + + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; - flags = mode & MPOL_MODE_FLAGS; - mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)mode >= MPOL_MAX) - return -EINVAL; - if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) - return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_set_mempolicy(mode, flags, &nodes); + + return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, -- Gitee From 52e5df72c58e1c804c57cbcdfe37dc9c0a4f846d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 27 Apr 2023 19:13:43 +0800 Subject: [PATCH 03/13] mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes mainline inclusion from mainline-v5.15-rc1 commit b27abaccf8e8b012f126da0c2a1ab32723ec8b9f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b27abaccf8e8b012f126da0c2a1ab32723ec8b9f -------------------------------- Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Signed-off-by: Dave Hansen Signed-off-by: Feng Tang Cc: Michal Hocko Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying b Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/mempolicy.c Signed-off-by: Ma Wupeng --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 68 ++++++++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 5c90176df4ab..7c4ffc207f67 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -22,6 +22,7 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, + MPOL_PREFERRED_MANY, MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bd74753c11ed..8ae8c1782dc8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -31,6 +31,9 @@ * but useful to set in a VMA when you have a non default * process policy. * + * preferred many Try a set of nodes first before normal fallback. This is + * similar to preferred without the special case. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -241,6 +244,14 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) return 0; } +static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; +} + static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) @@ -445,6 +456,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, + [MPOL_PREFERRED_MANY] = { + .create = mpol_new_preferred_many, + .rebind = mpol_rebind_preferred, + }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, @@ -923,6 +938,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) switch (p->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_MANY: *nodes = p->v.nodes; break; case MPOL_LOCAL: @@ -1479,7 +1495,13 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)(*mode) >= MPOL_MAX) + + /* + * The check should be 'mode >= MPOL_MAX', but as 'prefer_many' + * is not fully implemented, don't permit it to be used for now, + * and the logic will be restored in following patch + */ + if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; @@ -1908,17 +1930,28 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) */ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { + int mode = policy->mode; + /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(policy->mode == MPOL_BIND) && + if (unlikely(mode == MPOL_BIND) && apply_policy_zone(policy, gfp_zone(gfp)) && (cpuset_nodemask_valid_mems_allowed(&policy->v.nodes) || nodemask_has_cdm(policy->v.nodes))) return &policy->v.nodes; + if (mode == MPOL_PREFERRED_MANY) + return &policy->v.nodes; + return NULL; } -/* Return the node id preferred by the given mempolicy, or the given id */ +/* + * Return the preferred node id for 'prefer' mempolicy, and return + * the given id for all other policies. + * + * policy_node() is always coupled with policy_nodemask(), which + * secures the nodemask limit for 'bind' and 'prefer-many' policy. + */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED) { @@ -1977,7 +2010,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: { + case MPOL_BIND: + case MPOL_PREFERRED_MANY: + { struct zoneref *z; /* @@ -2058,12 +2093,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy - * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask + * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. - * If the effective policy is 'BIND, returns a pointer to the mempolicy's - * @nodemask for filtering the zonelist. + * If the effective policy is 'bind' or 'prefer-many', returns a pointer + * to the mempolicy's @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ @@ -2071,16 +2106,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { int nid; + int mode; *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; /* assume !MPOL_BIND */ + *nodemask = NULL; + mode = (*mpol)->mode; - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + if (unlikely(mode == MPOL_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if ((*mpol)->mode == MPOL_BIND) + if ((*mpol)->mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) *nodemask = &(*mpol)->v.nodes; } return nid; @@ -2118,6 +2155,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) init_nodemask_of_node(mask, nid); break; + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->v.nodes; @@ -2245,7 +2283,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * - * If the policy is interleave, or does not allow the current + * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode == MPOL_PREFERRED) @@ -2384,6 +2422,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: return a->v.preferred_node == b->v.preferred_node; case MPOL_LOCAL: return true; @@ -2524,6 +2563,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_PREFERRED: + if (node_isset(curnid, pol->v.nodes)) + goto out; polnid = pol->v.preferred_node; break; @@ -2533,8 +2574,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long case MPOL_BIND: + case MPOL_PREFERRED_MANY: /* - * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. @@ -2901,6 +2942,7 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", + [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2979,6 +3021,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (!nodelist) err = 0; goto out; + case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist @@ -3065,6 +3108,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED: node_set(pol->v.preferred_node, nodes); break; + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->v.nodes; -- Gitee From 831bd34c7f43971f201e39ab0b369d9e2a199c03 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 27 Apr 2023 19:13:44 +0800 Subject: [PATCH 04/13] mm/memplicy: add page allocation function for MPOL_PREFERRED_MANY policy mainline inclusion from mainline-v5.15-rc1 commit 4c54d94908e089e9741513797eac30a8b8217034 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4c54d94908e089e9741513797eac30a8b8217034 -------------------------------- The semantics of MPOL_PREFERRED_MANY is similar to MPOL_PREFERRED, that it will first try to allocate memory from the preferred node(s), and fallback to all nodes in system when first try fails. Add a dedicated function alloc_pages_preferred_many() for it just like for 'interleave' policy, which will be used by 2 general memoory allocation APIs: alloc_pages() and alloc_pages_vma() Link: https://lore.kernel.org/r/20200630212517.308045-9-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-3-git-send-email-feng.tang@intel.com Suggested-by: Michal Hocko Originally-by: Ben Widawsky Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ma Wupeng --- mm/mempolicy.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8ae8c1782dc8..a9e615993d81 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2238,6 +2238,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; } +static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + int nid, struct mempolicy *pol) +{ + struct page *page; + gfp_t preferred_gfp; + + /* + * This is a two pass approach. The first pass will only try the + * preferred nodes but skip the direct reclaim and allow the + * allocation to fail, while the second pass will try all the + * nodes in system. + */ + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = __alloc_pages(preferred_gfp, order, nid, &pol->v.nodes); + if (!page) + page = __alloc_pages(gfp, order, numa_node_id(), NULL); + + return page; +} + /** * alloc_pages_vma - Allocate a page for a VMA. * @gfp: GFP flags. @@ -2273,6 +2294,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } + if (pol->mode == MPOL_PREFERRED_MANY) { + page = alloc_pages_preferred_many(gfp, order, node, pol); + mpol_cond_put(pol); + goto out; + } + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { int hpage_node = node; @@ -2350,6 +2377,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) */ if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else if (pol->mode == MPOL_PREFERRED_MANY) + page = alloc_pages_preferred_many(gfp, order, + numa_node_id(), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), -- Gitee From 27a782f719fdab073c5b0e19764fad1f93cddd00 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 27 Apr 2023 19:13:45 +0800 Subject: [PATCH 05/13] mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY mainline inclusion from mainline-v5.15-rc1 commit cfcaa66f803233c50e17239469f6c96136a673a1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cfcaa66f803233c50e17239469f6c96136a673a1 -------------------------------- Implement the missing huge page allocation functionality while obeying the preferred node semantics. This is similar to the implementation for general page allocation, as it uses a fallback mechanism to try multiple preferred nodes first, and then all other nodes. To avoid adding too many "#ifdef CONFIG_NUMA" check, add a helper function in mempolicy.h to check whether a mempolicy is MPOL_PREFERRED_MANY. [akpm@linux-foundation.org: fix compiling issue when merging with other hugetlb patch] [Thanks to 0day bot for catching the !CONFIG_NUMA compiling issue] [mhocko@suse.com: suggest to remove the #ifdef CONFIG_NUMA check] [ben.widawsky@intel.com: add helpers to avoid ifdefs] Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com [nathan@kernel.org: initialize page to NULL in alloc_buddy_huge_page_with_mpol()] Link: https://lkml.kernel.org/r/20210810200632.3812797-1-nathan@kernel.org Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Signed-off-by: Nathan Chancellor Co-developed-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: include/linux/mempolicy.h mm/hugetlb.c Signed-off-by: Ma Wupeng --- include/linux/mempolicy.h | 10 ++++++++++ mm/hugetlb.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ba74e7399dc6..8a55d0dd3c88 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -204,6 +204,11 @@ extern void mpol_put_task_policy(struct task_struct *); extern long __do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags, struct mm_struct *mm); + +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return (pol->mode == MPOL_PREFERRED_MANY); +} #else struct mempolicy {}; @@ -319,5 +324,10 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { return NULL; } + +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return false; +} #endif /* CONFIG_NUMA */ #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 63880f3e682d..45c227c9e4c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1154,7 +1154,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1175,7 +1175,19 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask, mpol); + + if (mpol_is_preferred_many(mpol)) { + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask, + mpol); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask, + mpol); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; @@ -2193,16 +2205,26 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); - mpol_cond_put(mpol); + if (mpol_is_preferred_many(mpol)) { + gfp_t gfp = gfp_mask | __GFP_NOWARN; + + gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + mpol_cond_put(mpol); return page; } -- Gitee From bf5c272663e6cf392f4321dcb2aa5c413ad7cdf9 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 27 Apr 2023 19:13:46 +0800 Subject: [PATCH 06/13] mm/mempolicy: advertise new MPOL_PREFERRED_MANY mainline inclusion from mainline-v5.15-rc1 commit a38a59fdfa10be55d08e4530923d950e739ac6a2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a38a59fdfa10be55d08e4530923d950e739ac6a2 -------------------------------- Adds a new mode to the existing mempolicy modes, MPOL_PREFERRED_MANY. MPOL_PREFERRED_MANY will be adequately documented in the internal admin-guide with this patch. Eventually, the man pages for mbind(2), get_mempolicy(2), set_mempolicy(2) and numactl(8) will also have text about this mode. Those shall contain the canonical reference. NUMA systems continue to become more prevalent. New technologies like PMEM make finer grain control over memory access patterns increasingly desirable. MPOL_PREFERRED_MANY allows userspace to specify a set of nodes that will be tried first when performing allocations. If those allocations fail, all remaining nodes will be tried. It's a straight forward API which solves many of the presumptive needs of system administrators wanting to optimize workloads on such machines. The mode will work either per VMA, or per thread. [Michal Hocko: refine kernel doc for MPOL_PREFERRED_MANY] Link: https://lore.kernel.org/r/20200630212517.308045-13-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-5-git-send-email-feng.tang@intel.com Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ma Wupeng --- .../admin-guide/mm/numa_memory_policy.rst | 15 +++++++++++---- mm/mempolicy.c | 7 +------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 067a90a1499c..64fd0ba0d057 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -245,6 +245,13 @@ MPOL_INTERLEAVED address range or file. During system boot up, the temporary interleaved system default policy works in this mode. +MPOL_PREFERRED_MANY + This mode specifices that the allocation should be preferrably + satisfied from the nodemask specified in the policy. If there is + a memory pressure on all nodes in the nodemask, the allocation + can fall back to all existing numa nodes. This is effectively + MPOL_PREFERRED allowed for a mask rather than a single node. + NUMA memory policy supports the following optional mode flags: MPOL_F_STATIC_NODES @@ -253,10 +260,10 @@ MPOL_F_STATIC_NODES nodes changes after the memory policy has been defined. Without this flag, any time a mempolicy is rebound because of a - change in the set of allowed nodes, the node (Preferred) or - nodemask (Bind, Interleave) is remapped to the new set of - allowed nodes. This may result in nodes being used that were - previously undesired. + change in the set of allowed nodes, the preferred nodemask (Preferred + Many), preferred node (Preferred) or nodemask (Bind, Interleave) is + remapped to the new set of allowed nodes. This may result in nodes + being used that were previously undesired. With this flag, if the user-specified nodes overlap with the nodes allowed by the task's cpuset, then the memory policy is diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a9e615993d81..0be5f66d04ea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1496,12 +1496,7 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - /* - * The check should be 'mode >= MPOL_MAX', but as 'prefer_many' - * is not fully implemented, don't permit it to be used for now, - * and the logic will be restored in following patch - */ - if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY) + if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; -- Gitee From cb49a6277da792e32950cf7ffe24ea2e504fe0a9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 27 Apr 2023 19:13:47 +0800 Subject: [PATCH 07/13] mm/mempolicy: use policy_node helper with MPOL_PREFERRED_MANY mainline inclusion from mainline-v5.17-rc1 commit c04551162167368022a61899843821bbf015b473 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c04551162167368022a61899843821bbf015b473 -------------------------------- Patch series "mm: add new syscall set_mempolicy_home_node", v6. This patch (of 3): A followup patch will enable setting a home node with MPOL_PREFERRED_MANY memory policy. To facilitate that switch to using policy_node helper. There is no functional change in this patch. Link: https://lkml.kernel.org/r/20211202123810.267175-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20211202123810.267175-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ma Wupeng --- mm/mempolicy.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0be5f66d04ea..337c0e57075b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2249,7 +2249,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, &pol->v.nodes); if (!page) - page = __alloc_pages(gfp, order, numa_node_id(), NULL); + page = __alloc_pages(gfp, order, nid, NULL); return page; } @@ -2290,6 +2290,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, } if (pol->mode == MPOL_PREFERRED_MANY) { + node = policy_node(gfp, pol, node); page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); goto out; @@ -2374,7 +2375,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, - numa_node_id(), pol); + policy_node(gfp, pol, numa_node_id()), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), -- Gitee From 3b3547172fa3f32d55dce76932e91458604713a2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 27 Apr 2023 19:13:48 +0800 Subject: [PATCH 08/13] mm/mempolicy: add set_mempolicy_home_node syscall mainline inclusion from mainline-v5.17-rc1 commit c6018b4b254971863bd0ad36bb5e7d0fa0f0ddb0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c6018b4b254971863bd0ad36bb5e7d0fa0f0ddb0 -------------------------------- This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: include/linux/mempolicy.h mm/mempolicy.c Signed-off-by: Ma Wupeng --- .../admin-guide/mm/numa_memory_policy.rst | 16 +++- include/linux/mempolicy.h | 1 + mm/mempolicy.c | 79 +++++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 64fd0ba0d057..5a6afecbb0d0 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -408,7 +408,7 @@ follows: Memory Policy APIs ================== -Linux supports 3 system calls for controlling memory policy. These APIS +Linux supports 4 system calls for controlling memory policy. These APIS always affect only the calling task, the calling task's address space, or some shared object mapped into the calling task's address space. @@ -460,6 +460,20 @@ requested via the 'flags' argument. See the mbind(2) man page for more details. +Set home node for a Range of Task's Address Spacec:: + + long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); + +sys_set_mempolicy_home_node set the home node for a VMA policy present in the +task's address range. The system call updates the home node only for the existing +mempolicy range. Other address ranges are ignored. A home node is the NUMA node +closest to which page allocation will come from. Specifying the home node override +the default allocation policy to allocate memory close to the local node for an +executing CPU. + + Memory Policy Command Line Interface ==================================== diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 8a55d0dd3c88..7d37309bd456 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -51,6 +51,7 @@ struct mempolicy { nodemask_t nodes; /* interleave/bind */ /* undefined for default */ } v; + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 337c0e57075b..ef78d11a70bf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -349,6 +349,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; + policy->home_node = NUMA_NO_NODE; return policy; } @@ -1525,6 +1526,77 @@ static long kernel_mbind(unsigned long start, unsigned long len, return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct mempolicy *new; + unsigned long vmstart; + unsigned long vmend; + unsigned long end; + int err = -ENOENT; + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; + + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. + */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) + return -EINVAL; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + vma = find_vma(mm, start); + for (; vma && vma->vm_start < end; vma = vma->vm_next) { + + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + new = mpol_dup(vma_policy(vma)); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + /* + * Only update home node if there is an existing vma policy + */ + if (!new) + continue; + + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. + */ + if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + err = -EOPNOTSUPP; + break; + } + + new->home_node = home_node; + err = mbind_range(mm, vmstart, vmend, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; +} + SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) @@ -1967,6 +2039,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) } } + if ((policy->mode == MPOL_BIND || + policy->mode == MPOL_PREFERRED_MANY) && + policy->home_node != NUMA_NO_NODE) + return policy->home_node; + return nd; } @@ -2439,6 +2516,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) return false; if (a->flags != b->flags) return false; + if (a->home_node != b->home_node) + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; -- Gitee From 1e3451e075c8d91e3be398e395b9cfc9cf487d6d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 27 Apr 2023 19:13:49 +0800 Subject: [PATCH 09/13] mm/mempolicy: wire up syscall set_mempolicy_home_node mainline inclusion from mainline-v5.17-rc1 commit 21b084fdf2a49ca1634e8e360e9ab6f9ff0dee11 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=21b084fdf2a49ca1634e8e360e9ab6f9ff0dee11 -------------------------------- Link: https://lkml.kernel.org/r/20211202123810.267175-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ma Wupeng --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 2 +- arch/arm64/include/asm/unistd.h | 1 - arch/arm64/include/asm/unistd32.h | 4 ++-- arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/syscalls.h | 3 +++ include/uapi/asm-generic/unistd.h | 4 ++-- kernel/sys_ni.c | 1 + tools/include/uapi/asm-generic/unistd.h | 4 ++-- 22 files changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index dc18bb368186..6e7d777ca2ad 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -483,3 +483,4 @@ 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self +560 common set_mempolicy_home_node sys_ni_syscall diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 2a5d0253a7cb..735b12e65c69 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -463,7 +463,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 39ce73c8084d..60e89a6ba3bd 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -42,7 +42,6 @@ #define __ARM_NR_compat_cacheflush (__ARM_NR_COMPAT_BASE + 2) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) - #define __NR_compat_syscalls 457 #endif diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 3a278103afd9..10d1b21f09d2 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -908,8 +908,8 @@ __SYSCALL(__NR_kabi_reserved447, sys_ni_syscall) __SYSCALL(__NR_kabi_reserved448, sys_ni_syscall) #define __NR_kabi_reserved449 449 __SYSCALL(__NR_kabi_reserved449, sys_ni_syscall) -#define __NR_kabi_reserved450 450 -__SYSCALL(__NR_kabi_reserved450, sys_ni_syscall) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) #define __NR_kabi_reserved451 451 __SYSCALL(__NR_kabi_reserved451, sys_ni_syscall) #define __NR_kabi_reserved452 452 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index d5b5e5d1bd39..d6eba311cb3d 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -364,3 +364,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index bdf6263478f5..5ac522662231 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 5ad55e42b655..8b95ed1df39a 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -449,3 +449,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 11e7fe97b995..6d2913ecc903 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -382,3 +382,4 @@ 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self +450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index b09a3a8b3f77..215d92ddeb0d 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -358,3 +358,4 @@ 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 263b4e428764..fea634250d2d 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -431,3 +431,4 @@ 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self +450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index d699b3a3c728..df9294268449 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -441,3 +441,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f48c87f72d9d..2a7c804b00b8 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -539,7 +539,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index c40d1e5a904b..4830c583b2be 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -446,3 +446,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 162485313465..1c10af5c6dc7 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -446,3 +446,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index bf6c45e30a66..bda8abd0878e 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -489,3 +489,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c89fdfb18c8b..c71dc25932b6 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -454,7 +454,7 @@ 447 i386 kabi_reserved447 sys_ni_syscall 448 i386 kabi_reserved448 sys_ni_syscall 449 i386 kabi_reserved449 sys_ni_syscall -450 i386 kabi_reserved450 sys_ni_syscall +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 451 i386 kabi_reserved451 sys_ni_syscall 452 i386 kabi_reserved452 sys_ni_syscall 453 i386 kabi_reserved453 sys_ni_syscall diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5775a88d1d02..a4f8fe52639b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,7 +371,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 9b083a7b4e23..e1c88f068aa1 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -414,3 +414,4 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 5a9192eae603..5e3ca1c49f3f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1039,6 +1039,9 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type, const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); +asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 72f9235614dc..064f512fefd9 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -878,8 +878,8 @@ __SYSCALL(__NR_kabi_reserved447, sys_ni_syscall) __SYSCALL(__NR_kabi_reserved448, sys_ni_syscall) #define __NR_kabi_reserved449 449 __SYSCALL(__NR_kabi_reserved449, sys_ni_syscall) -#define __NR_kabi_reserved450 450 -__SYSCALL(__NR_kabi_reserved450, sys_ni_syscall) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) #define __NR_kabi_reserved451 451 __SYSCALL(__NR_kabi_reserved451, sys_ni_syscall) #define __NR_kabi_reserved452 452 diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 218dc39a4e32..d264ee7e3d87 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -298,6 +298,7 @@ COND_SYSCALL(migrate_pages); COND_SYSCALL_COMPAT(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL_COMPAT(move_pages); +COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index ddd5d28b5b7f..c32cfcc11e61 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -878,8 +878,8 @@ __SYSCALL(__NR_kabi_reserved447, sys_ni_syscall) __SYSCALL(__NR_kabi_reserved448, sys_ni_syscall) #define __NR_kabi_reserved449 449 __SYSCALL(__NR_kabi_reserved449, sys_ni_syscall) -#define __NR_kabi_reserved450 450 -__SYSCALL(__NR_kabi_reserved450, sys_ni_syscall) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) #define __NR_kabi_reserved451 451 __SYSCALL(__NR_kabi_reserved451, sys_ni_syscall) #define __NR_kabi_reserved452 452 -- Gitee From 94e5cb7eda65f27b7373994b353fef191d6afcf5 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 27 Apr 2023 19:13:50 +0800 Subject: [PATCH 10/13] mm/hugetlb: add dedicated func to get 'allowed' nodemask for current process mainline inclusion from mainline-v6.1-rc1 commit d2226ebd5484afcf9f9b71b394ec1567a7730eb1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d2226ebd5484afcf9f9b71b394ec1567a7730eb1 -------------------------------- Muchun Song found that after MPOL_PREFERRED_MANY policy was introduced in commit b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes"), the policy_nodemask_current()'s semantics for this new policy has been changed, which returns 'preferred' nodes instead of 'allowed' nodes. With the changed semantic of policy_nodemask_current, a task with MPOL_PREFERRED_MANY policy could fail to get its reservation even though it can fall back to other nodes (either defined by cpusets or all online nodes) for that reservation failing mmap calles unnecessarily early. The fix is to not consider MPOL_PREFERRED_MANY for reservations at all because they, unlike MPOL_MBIND, do not pose any actual hard constrain. Michal suggested the policy_nodemask_current() is only used by hugetlb, and could be moved to hugetlb code with more explicit name to enforce the 'allowed' semantics for which only MPOL_BIND policy matters. apply_policy_zone() is made extern to be called in hugetlb code and its return value is changed to bool. [1]. https://lore.kernel.org/lkml/20220801084207.39086-1-songmuchun@bytedance.com/t/ Link: https://lkml.kernel.org/r/20220805005903.95563-1-feng.tang@intel.com Fixes: b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes") Signed-off-by: Feng Tang Reported-by: Muchun Song Suggested-by: Michal Hocko Acked-by: Michal Hocko Reviewed-by: Muchun Song Cc: Mike Kravetz Cc: Dave Hansen Cc: Ben Widawsky Signed-off-by: Andrew Morton Conflicts: include/linux/mempolicy.h mm/hugetlb.c Signed-off-by: Ma Wupeng --- include/linux/mempolicy.h | 15 +++------------ mm/hugetlb.c | 26 +++++++++++++++++++++----- mm/mempolicy.c | 2 +- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 7d37309bd456..3341d198a34f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -155,13 +155,6 @@ extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, const nodemask_t *mask); extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - struct mempolicy *mpol = get_task_policy(current); - - return policy_nodemask(gfp, mpol); -} - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -210,6 +203,9 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } + +extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); + #else struct mempolicy {}; @@ -321,11 +317,6 @@ static inline void mpol_put_task_policy(struct task_struct *task) { } -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - return NULL; -} - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 45c227c9e4c8..caab12d3eb95 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3853,19 +3853,35 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", default_hugepagesz_setup); +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->v.nodes))) + return &mpol->v.nodes; +#endif + return NULL; +} + static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; - nodemask_t *mpol_allowed; + nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); - mpol_allowed = policy_nodemask_current(gfp_mask); - + mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mpol_allowed || - (mpol_allowed && node_isset(node, *mpol_allowed))) + if (!mbind_nodemask || + (mbind_nodemask && node_isset(node, *mbind_nodemask))) nr += array[node]; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ef78d11a70bf..330c4f95aece 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1971,7 +1971,7 @@ bool vma_policy_mof(struct vm_area_struct *vma) return pol->flags & MPOL_F_MOF; } -static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; -- Gitee From 7d438edbee57589fb4237053afe3eb48a31173fb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 27 Apr 2023 19:13:51 +0800 Subject: [PATCH 11/13] tools headers UAPI: Sync files changed by new set_mempolicy_home_node syscall mainline inclusion from mainline-v5.17-rc1 commit 6e10e21915c1ab6eaa145f7b5ebaf4500af1b011 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6e10e21915c1ab6eaa145f7b5ebaf4500af1b011 -------------------------------- To pick the changes in these csets: 21b084fdf2a49ca1 ("mm/mempolicy: wire up syscall set_mempolicy_home_node") That add support for this new syscall in tools such as 'perf trace'. For instance, this is now possible: [root@five ~]# perf trace -e set_mempolicy_home_node ^C[root@five ~]# [root@five ~]# perf trace -v -e set_mempolicy_home_node Using CPUID AuthenticAMD-25-21-0 event qualifier tracepoint filter: (common_pid != 253729 && common_pid != 3585) && (id == 450) mmap size 528384B ^C[root@five ~] [root@five ~]# perf trace -v -e set* --max-events 5 Using CPUID AuthenticAMD-25-21-0 event qualifier tracepoint filter: (common_pid != 253734 && common_pid != 3585) && (id == 38 || id == 54 || id == 105 || id == 106 || id == 109 || id == 112 || id == 113 || id == 114 || id == 116 || id == 117 || id == 119 || id == 122 || id == 123 || id == 141 || id == 160 || id == 164 || id == 170 || id == 171 || id == 188 || id == 205 || id == 218 || id == 238 || id == 273 || id == 308 || id == 450) mmap size 528384B 0.000 ( 0.008 ms): bash/253735 setpgid(pid: 253735 (bash), pgid: 253735 (bash)) = 0 6849.011 ( 0.008 ms): bash/16046 setpgid(pid: 253736 (bash), pgid: 253736 (bash)) = 0 6849.080 ( 0.005 ms): bash/253736 setpgid(pid: 253736 (bash), pgid: 253736 (bash)) = 0 7437.718 ( 0.009 ms): gnome-shell/253737 set_robust_list(head: 0x7f34b527e920, len: 24) = 0 13445.986 ( 0.010 ms): bash/16046 setpgid(pid: 253738 (bash), pgid: 253738 (bash)) = 0 [root@five ~]# That is the filter expression attached to the raw_syscalls:sys_{enter,exit} tracepoints. $ find tools/perf/arch/ -name "syscall*tbl" | xargs grep -w set_mempolicy_home_node tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/powerpc/entry/syscalls/syscall.tbl:450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/s390/entry/syscalls/syscall.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/x86/entry/syscalls/syscall_64.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node $ $ grep -w set_mempolicy_home_node /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c [450] = "set_mempolicy_home_node", $ This addresses these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl' diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl' differs from latest version at 'arch/mips/kernel/syscalls/syscall_n64.tbl' diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Aneesh Kumar K.V Cc: Linus Torvalds Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ma Wupeng --- tools/include/uapi/asm-generic/unistd.h | 3 +++ tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index c32cfcc11e61..7b3a3c84cac1 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -893,6 +893,9 @@ __SYSCALL(__NR_kabi_reserved455, sys_ni_syscall) #define __NR_kabi_reserved456 456 __SYSCALL(__NR_kabi_reserved456, sys_ni_syscall) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + #undef __NR_syscalls #define __NR_syscalls 457 diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index c68c1111c155..af58772e41b7 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -539,7 +539,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index d2fa9647ce25..fe015307e163 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -388,3 +388,4 @@ 378 common s390_guarded_storage sys_s390_guarded_storage compat_sys_s390_guarded_storage 379 common statx sys_statx compat_sys_statx 380 common s390_sthyi sys_s390_sthyi compat_sys_s390_sthyi +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node \ No newline at end of file diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 8f4ad1695d8f..e52491d39345 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,7 +371,7 @@ 447 common kabi_reserved447 sys_ni_syscall 448 common kabi_reserved448 sys_ni_syscall 449 common kabi_reserved449 sys_ni_syscall -450 common kabi_reserved450 sys_ni_syscall +450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common kabi_reserved451 sys_ni_syscall 452 common kabi_reserved452 sys_ni_syscall 453 common kabi_reserved453 sys_ni_syscall -- Gitee From 334e66dad3847e59f97785b86615c1defabdfcd0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 27 Apr 2023 19:13:52 +0800 Subject: [PATCH 12/13] mm/mempolicy: fix memory leak in set_mempolicy_home_node system call mainline inclusion from mainline-v6.2-rc1 commit 38ce7c9bdfc228c14d7621ba36d3eebedd9d4f76 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=38ce7c9bdfc228c14d7621ba36d3eebedd9d4f76 -------------------------------- When encountering any vma in the range with policy other than MPOL_BIND or MPOL_PREFERRED_MANY, an error is returned without issuing a mpol_put on the policy just allocated with mpol_dup(). This allows arbitrary users to leak kernel memory. Link: https://lkml.kernel.org/r/20221215194621.202816-1-mathieu.desnoyers@efficios.com Fixes: c6018b4b2549 ("mm/mempolicy: add set_mempolicy_home_node syscall") Signed-off-by: Mathieu Desnoyers Reviewed-by: Randy Dunlap Reviewed-by: "Huang, Ying" Reviewed-by: Aneesh Kumar K.V Acked-by: Michal Hocko Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: [5.17+] Signed-off-by: Andrew Morton Signed-off-by: Ma Wupeng --- mm/mempolicy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 330c4f95aece..c837d56b466e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1583,6 +1583,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le * the home node for vmas we already updated before. */ if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + mpol_put(new); err = -EOPNOTSUPP; break; } -- Gitee From 160c26bcd3bd210068ef07868573d56527fd0e13 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Thu, 27 Apr 2023 19:13:53 +0800 Subject: [PATCH 13/13] mm: Use wrapper to fix KABI broken in struct mempolicy hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6I1Z2 CVE: NA -------------------------------- Since struct mempolicy is commonly used by external users, use wrapper to fix KABI broken in struct mempolicy. Signed-off-by: Ma Wupeng --- include/linux/mempolicy.h | 6 +++- mm/mempolicy.c | 62 ++++++++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3341d198a34f..d4920e4a3e38 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -51,13 +51,17 @@ struct mempolicy { nodemask_t nodes; /* interleave/bind */ /* undefined for default */ } v; - int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; }; +struct mempolicy_wrapper { + struct mempolicy policy; + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ +}; + /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c837d56b466e..b58ec3f98896 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -122,12 +122,14 @@ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ -static struct mempolicy default_policy = { - .refcnt = ATOMIC_INIT(1), /* never free it */ - .mode = MPOL_LOCAL, +static struct mempolicy_wrapper default_policy = { + .policy = { + .refcnt = ATOMIC_INIT(1), /* never free it */ + .mode = MPOL_LOCAL, + } }; -static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +static struct mempolicy_wrapper preferred_node_policy[MAX_NUMNODES]; /** * numa_map_to_online_node - Find closest online node @@ -165,13 +167,13 @@ struct mempolicy *get_task_policy(struct task_struct *p) node = numa_node_id(); if (node != NUMA_NO_NODE) { - pol = &preferred_node_policy[node]; + pol = &preferred_node_policy[node].policy; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } - return &default_policy; + return &default_policy.policy; } static const struct mempolicy_operations { @@ -311,6 +313,7 @@ static int mpol_set_nodemask(struct mempolicy *pol, static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { + struct mempolicy_wrapper *wrapper; struct mempolicy *policy; pr_debug("setting mode %d flags %d nodes[0] %lx\n", @@ -346,10 +349,11 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); + wrapper = container_of(policy, struct mempolicy_wrapper, policy); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; - policy->home_node = NUMA_NO_NODE; + wrapper->home_node = NUMA_NO_NODE; return policy; } @@ -933,7 +937,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { nodes_clear(*nodes); - if (p == &default_policy) + if (p == &default_policy.policy) return; switch (p->mode) { @@ -1013,7 +1017,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, return -EINVAL; if (!pol) - pol = &default_policy; /* indicates default behavior */ + pol = &default_policy.policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { @@ -1038,7 +1042,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, goto out; } } else { - *policy = pol == &default_policy ? MPOL_DEFAULT : + *policy = pol == &default_policy.policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing @@ -1530,6 +1534,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; + struct mempolicy_wrapper *wrapper; struct vm_area_struct *vma; struct mempolicy *new; unsigned long vmstart; @@ -1588,7 +1593,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le break; } - new->home_node = home_node; + wrapper = container_of(new, struct mempolicy_wrapper, policy); + wrapper->home_node = home_node; err = mbind_range(mm, vmstart, vmend, new); mpol_put(new); if (err) @@ -2022,6 +2028,9 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { + struct mempolicy_wrapper *warpper; + + warpper = container_of(policy, struct mempolicy_wrapper, policy); if (policy->mode == MPOL_PREFERRED) { nd = policy->v.preferred_node; } else { @@ -2042,8 +2051,8 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) if ((policy->mode == MPOL_BIND || policy->mode == MPOL_PREFERRED_MANY) && - policy->home_node != NUMA_NO_NODE) - return policy->home_node; + warpper->home_node != NUMA_NO_NODE) + return warpper->home_node; return nd; } @@ -2439,7 +2448,7 @@ EXPORT_SYMBOL(alloc_pages_vma); */ struct page *alloc_pages(gfp_t gfp, unsigned order) { - struct mempolicy *pol = &default_policy; + struct mempolicy *pol = &default_policy.policy; struct page *page; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) @@ -2488,17 +2497,22 @@ int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + struct mempolicy_wrapper *old_wrapper, *new_wrapper; if (!new) return ERR_PTR(-ENOMEM); + old_wrapper = container_of(old, struct mempolicy_wrapper, policy); + new_wrapper = container_of(new, struct mempolicy_wrapper, policy); + /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); - *new = *old; + *new_wrapper = *old_wrapper; task_unlock(current); - } else - *new = *old; + } else { + *new_wrapper = *old_wrapper; + } if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); @@ -2511,13 +2525,18 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { + struct mempolicy_wrapper *wrapper_a, *wrapper_b; + + wrapper_a = container_of(a, struct mempolicy_wrapper, policy); + wrapper_b = container_of(b, struct mempolicy_wrapper, policy); + if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; - if (a->home_node != b->home_node) + if (wrapper_a->home_node != wrapper_b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) @@ -2981,7 +3000,7 @@ void __init numa_policy_init(void) int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", - sizeof(struct mempolicy), + sizeof(struct mempolicy_wrapper), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", @@ -2989,7 +3008,7 @@ void __init numa_policy_init(void) 0, SLAB_PANIC, NULL); for_each_node(nid) { - preferred_node_policy[nid] = (struct mempolicy) { + preferred_node_policy[nid].policy = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, @@ -3202,7 +3221,8 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; - if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { + if (pol && pol != &default_policy.policy && + !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } -- Gitee