diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 649fbb5c1adc5c5ffcb06f7b548e8f83146be48c..53e66ca6fd5a20427f3a56a234ba212319f0bd36 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -317,7 +317,13 @@ struct mem_cgroup {
 #endif
 #ifdef CONFIG_MEMCG_KMEM
 	int kmemcg_id;
-	struct obj_cgroup __rcu *objcg;
+	/*
+	 * memcg->objcg is wiped out as part of the objcg reparenting
+	 * process. memcg->orig_objcg preserves a pointer (and a reference)
+	 * to the original objcg until the end of the memcg's life.
+	 */
+	struct obj_cgroup __rcu *objcg;
+	struct obj_cgroup *orig_objcg;
 	/* list of inherited objcgs, protected by objcg_lock */
 	struct list_head objcg_list;
 #endif
@@ -1855,9 +1861,27 @@ bool mem_cgroup_kmem_disabled(void);
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_page(struct page *page, int order);
 
-struct obj_cgroup *get_obj_cgroup_from_current(void);
+/*
+ * The returned objcg pointer is safe to use without additional
+ * protection within a scope. The scope is defined either by
+ * the current task (similar to the "current" global variable)
+ * or by a set_active_memcg() pair.
+ * Please use obj_cgroup_get() to get a reference if the pointer
+ * needs to be used outside of the local scope.
+ */
+struct obj_cgroup *current_obj_cgroup(void);
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
 
+static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+	struct obj_cgroup *objcg = current_obj_cgroup();
+
+	if (objcg)
+		obj_cgroup_get(objcg);
+
+	return objcg;
+}
+
 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f18d450561881bfbc193b3be9e18d5f8ab17137..ce6741061708ee39516fce943791efd71a53c4e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1454,6 +1454,10 @@ struct task_struct {
 	struct mem_cgroup *active_memcg;
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+	struct obj_cgroup *objcg;
+#endif
+
 #ifdef CONFIG_BLK_CGROUP
 	struct gendisk *throttle_disk;
 #endif
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8d89c8c4fac1f2db1fc278478486aa71516b52bd..9a19f1b42f64129936dd763e1f9436536e60d47e 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -403,6 +403,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
  * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
  * given memcg.
  *
+ * Please make sure that the caller has a reference to the passed memcg
+ * structure, so its lifetime is guaranteed to exceed the scope between two
+ * set_active_memcg() calls.
+ *
  * NOTE: This function can nest. Users must save the return value and
  * reset the previous value after their own charging scope is over.
  */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 346be829229476e96bf7b41452213203364d1b6a..da80b7887a318c5e6e709dbdf411d25aed0bdf94 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -275,6 +275,9 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
 	return container_of(vmpr, struct mem_cgroup, vmpressure);
 }
 
+#define CURRENT_OBJCG_UPDATE_BIT 0
+#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
+
 #ifdef CONFIG_MEMCG_KMEM
 static DEFINE_SPINLOCK(objcg_lock);
 
@@ -1097,19 +1100,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 
-static __always_inline bool memcg_kmem_bypass(void)
-{
-	/* Allow remote memcg charging from any context. */
-	if (unlikely(active_memcg()))
-		return false;
-
-	/* Memcg to charge can't be determined. */
-	if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
-		return true;
-
-	return false;
-}
-
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -3112,28 +3102,105 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
 
 	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
 		objcg = rcu_dereference(memcg->objcg);
-		if (objcg && obj_cgroup_tryget(objcg))
+		if (likely(objcg && obj_cgroup_tryget(objcg)))
 			break;
 		objcg = NULL;
 	}
 	return objcg;
 }
 
-__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+static struct obj_cgroup *current_objcg_update(void)
 {
-	struct obj_cgroup *objcg = NULL;
 	struct mem_cgroup *memcg;
+	struct obj_cgroup *old, *objcg = NULL;
 
-	if (memcg_kmem_bypass())
-		return NULL;
+	do {
+		/* Atomically drop the update bit. */
+		old = xchg(&current->objcg, NULL);
+		if (old) {
+			old = (struct obj_cgroup *)
+				((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
+			if (old)
+				obj_cgroup_put(old);
+
+			old = NULL;
+		}
 
-	rcu_read_lock();
-	if (unlikely(active_memcg()))
-		memcg = active_memcg();
-	else
+		/* If the new objcg is NULL, no reason for the second atomic update. */
+		if (!current->mm || (current->flags & PF_KTHREAD))
+			return NULL;
+
+		/*
+		 * Release the objcg pointer from the previous iteration,
+		 * if try_cmpxchg() below fails.
+		 */
+		if (unlikely(objcg)) {
+			obj_cgroup_put(objcg);
+			objcg = NULL;
+		}
+
+		/*
+		 * Obtain the new objcg pointer. The current task can be
+		 * asynchronously moved to another memcg and the previous
+		 * memcg can be offlined. So let's get the memcg pointer
+		 * and try to get a reference to the objcg under an rcu read lock.
+		 */
+
+		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
-	objcg = __get_obj_cgroup_from_memcg(memcg);
-	rcu_read_unlock();
+		objcg = __get_obj_cgroup_from_memcg(memcg);
+		rcu_read_unlock();
+
+		/*
+		 * Try to set up the new objcg pointer atomically. If it
+		 * fails, it means the update flag was set concurrently, so
+		 * the whole procedure should be repeated.
+		 */
+	} while (!try_cmpxchg(&current->objcg, &old, objcg));
+
+	return objcg;
+}
+
+__always_inline struct obj_cgroup *current_obj_cgroup(void)
+{
+	struct mem_cgroup *memcg;
+	struct obj_cgroup *objcg;
+
+	if (in_task()) {
+		memcg = current->active_memcg;
+		if (unlikely(memcg))
+			goto from_memcg;
+
+		objcg = READ_ONCE(current->objcg);
+		if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
+			objcg = current_objcg_update();
+		/*
+		 * The objcg reference is kept by the task, so it is safe
+		 * for the current task to use the objcg.
+		 */
+		return objcg;
+	}
+
+	memcg = this_cpu_read(int_active_memcg);
+	if (unlikely(memcg))
+		goto from_memcg;
+
+	return NULL;
+
+from_memcg:
+	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+		/*
+		 * The memcg pointer is protected by the scope (see
+		 * set_active_memcg()) and is pinning the corresponding objcg,
+		 * so the objcg can't go away and can be used within the scope
+		 * without any additional protection.
+		 */
+		objcg = rcu_dereference_check(memcg->objcg, 1);
+		if (likely(objcg))
+			break;
+		objcg = NULL;
+	}
+
 	return objcg;
 }
 
@@ -3231,15 +3298,15 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
 	struct obj_cgroup *objcg;
 	int ret = 0;
 
-	objcg = get_obj_cgroup_from_current();
+	objcg = current_obj_cgroup();
 	if (objcg) {
 		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
 		if (!ret) {
+			obj_cgroup_get(objcg);
 			page->memcg_data = (unsigned long)objcg |
 				MEMCG_DATA_KMEM;
 			return 0;
 		}
-		obj_cgroup_put(objcg);
 	}
 	return ret;
 }
@@ -3883,6 +3950,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 
 	objcg->memcg = memcg;
 	rcu_assign_pointer(memcg->objcg, objcg);
+	obj_cgroup_get(objcg);
+	memcg->orig_objcg = objcg;
 
 	static_branch_enable(&memcg_kmem_online_key);
 
@@ -6336,6 +6405,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
 
+	if (memcg->orig_objcg)
+		obj_cgroup_put(memcg->orig_objcg);
+
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	kfree(memcg->vmstats);
@@ -7452,6 +7524,7 @@ static void mem_cgroup_move_task(void)
 		mem_cgroup_clear_mc();
 	}
 }
+
 #else /* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
 {
@@ -7465,8 +7538,39 @@ static void mem_cgroup_move_task(void)
 }
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_fork(struct task_struct *task)
+{
+	/*
+	 * Set the update flag to cause task->objcg to be initialized lazily
+	 * on the first allocation. It can be done without any synchronization
+	 * because it's always performed on the current task, as is
+	 * current_objcg_update().
+	 */
+	task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
+}
+
+static void mem_cgroup_exit(struct task_struct *task)
+{
+	struct obj_cgroup *objcg = task->objcg;
+
+	objcg = (struct obj_cgroup *)
+		((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
+	if (objcg)
+		obj_cgroup_put(objcg);
+
+	/*
+	 * Some kernel allocations can happen after this point,
+	 * but let's ignore them. It can be done without any synchronization
+	 * because it's always performed on the current task, as is
+	 * current_objcg_update().
+	 */
+	task->objcg = NULL;
+}
+#endif
+
 #ifdef CONFIG_LRU_GEN
-static void mem_cgroup_attach(struct cgroup_taskset *tset)
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct cgroup_subsys_state *css;
@@ -7484,10 +7588,31 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset)
 	task_unlock(task);
 }
 #else
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
+
+	cgroup_taskset_for_each(task, css, tset) {
+		/* atomically set the update bit */
+		set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
+	}
+}
+#else
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
 static void mem_cgroup_attach(struct cgroup_taskset *tset)
 {
+	mem_cgroup_lru_gen_attach(tset);
+	mem_cgroup_kmem_attach(tset);
 }
-#endif /* CONFIG_LRU_GEN */
+#endif
 
 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
 {
@@ -7956,9 +8081,15 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.css_reset = mem_cgroup_css_reset,
 	.css_rstat_flush = mem_cgroup_css_rstat_flush,
 	.can_attach = mem_cgroup_can_attach,
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
 	.attach = mem_cgroup_attach,
+#endif
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.post_attach = mem_cgroup_move_task,
+#ifdef CONFIG_MEMCG_KMEM
+	.fork = mem_cgroup_fork,
+	.exit = mem_cgroup_exit,
+#endif
 	.dfl_cftypes = memory_files,
 	.legacy_cftypes = mem_cgroup_legacy_files,
 	.early_init = 0,
diff --git a/mm/percpu.c b/mm/percpu.c
index a7665de8485fd9594c523c68ad579b1303e1c4ba..f53ba692d67a6deb8a3f2bed750fef71e4af2e8b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1628,14 +1628,12 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
 	if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT))
 		return true;
 
-	objcg = get_obj_cgroup_from_current();
+	objcg = current_obj_cgroup();
 	if (!objcg)
 		return true;
 
-	if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
-		obj_cgroup_put(objcg);
+	if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)))
 		return false;
-	}
 
 	*objcgp = objcg;
 	return true;
@@ -1649,6 +1647,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
 		return;
 
 	if (likely(chunk && chunk->obj_cgroups)) {
+		obj_cgroup_get(objcg);
 		chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
 
 		rcu_read_lock();
@@ -1657,7 +1656,6 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
 		rcu_read_unlock();
 	} else {
 		obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
-		obj_cgroup_put(objcg);
 	}
 }
 
diff --git a/mm/slab.h b/mm/slab.h
index 799a315695c6791c860f4b464c54552053e8e05a..3d07fb428393fe14a3adf7cc75940bd0c58431ee 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -484,7 +484,12 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
 	if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
 		return true;
 
-	objcg = get_obj_cgroup_from_current();
+	/*
+	 * The obtained objcg pointer is safe to use within the current scope,
+	 * defined by the current task or by a set_active_memcg() pair.
+	 * obj_cgroup_get() is used to get a permanent reference.
+	 */
+	objcg = current_obj_cgroup();
 	if (!objcg)
 		return true;
 
@@ -497,17 +502,14 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
 		css_put(&memcg->css);
 
 		if (ret)
-			goto out;
+			return false;
 	}
 
 	if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s)))
-		goto out;
+		return false;
 
 	*objcgp = objcg;
 	return true;
-out:
-	obj_cgroup_put(objcg);
-	return false;
 }
 
 static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
@@ -542,7 +544,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 			obj_cgroup_uncharge(objcg, obj_full_size(s));
 		}
 	}
-	obj_cgroup_put(objcg);
 }
 
 static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
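For illustration, here is a minimal caller-side sketch of the scoped protection documented by the new current_obj_cgroup() comment above. The helper example_charge_to_memcg() and its use of GFP_KERNEL are hypothetical and not part of this patch: within a scope pinned either by the current task or by a set_active_memcg() pair, the returned objcg may be used without taking a reference, and obj_cgroup_get() is only needed when the pointer has to outlive the scope, as the slab and percpu hooks above do when they store the objcg alongside the allocated object.

#include <linux/memcontrol.h>
#include <linux/sched/mm.h>

/*
 * Hypothetical helper: charge @size bytes to @memcg and, on success, hand
 * back a referenced objcg for a later obj_cgroup_uncharge(). The caller is
 * assumed to already hold a reference to @memcg (see the new comment in
 * include/linux/sched/mm.h).
 */
static int example_charge_to_memcg(struct mem_cgroup *memcg, size_t size,
				   struct obj_cgroup **objcgp)
{
	struct mem_cgroup *old_memcg;
	struct obj_cgroup *objcg;
	int ret = 0;

	*objcgp = NULL;
	old_memcg = set_active_memcg(memcg);

	/* Borrowed pointer, valid only within this scope. */
	objcg = current_obj_cgroup();
	if (objcg) {
		ret = obj_cgroup_charge(objcg, GFP_KERNEL, size);
		if (!ret) {
			/*
			 * Take a real reference only because the objcg is
			 * stored and used after the scope ends.
			 */
			obj_cgroup_get(objcg);
			*objcgp = objcg;
		}
	}

	set_active_memcg(old_memcg);
	return ret;
}

The same pattern is what lets __memcg_kmem_charge_page() and the slab/percpu hooks in this patch defer obj_cgroup_get() to the point where the pointer is actually stored, instead of taking and dropping a reference on every allocation.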