diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8c199fe368c2e7849e061d2872173161cdce0232..016b9b334c217c38f24ce9568515e6aa3fc711e0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1172,8 +1172,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } -void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_ratelimited(void); +void mem_cgroup_flush_stats(struct mem_cgroup *memcg); +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1691,11 +1691,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } -static inline void mem_cgroup_flush_stats(void) +static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } -static inline void mem_cgroup_flush_stats_ratelimited(void) +static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fff8b93225219c6b5e99e6b114516f0aa3e02e02..3dbc90681c5ecfa7df624b4df32964f070454b96 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -82,6 +82,7 @@ #include #include +#include struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); @@ -600,116 +601,6 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } -/* - * memcg and lruvec stats flushing - * - * Many codepaths leading to stats update or read are performance sensitive and - * adding stats flushing in such codepaths is not desirable. So, to optimize the - * flushing the kernel does: - * - * 1) Periodically and asynchronously flush the stats every 2 seconds to not let - * rstat update tree grow unbounded. - * - * 2) Flush the stats synchronously on reader side only when there are more than - * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization - * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but - * only for 2 seconds due to (1). - */ -static void flush_memcg_stats_dwork(struct work_struct *w); -static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static DEFINE_PER_CPU(unsigned int, stats_updates); -static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); -static atomic_t stats_flush_threshold = ATOMIC_INIT(0); -static u64 flush_next_time; - -#define FLUSH_TIME (2UL*HZ) - -/* - * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can - * not rely on this as part of an acquired spinlock_t lock. These functions are - * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion - * is sufficient. - */ -static void memcg_stats_lock(void) -{ - preempt_disable_nested(); - VM_WARN_ON_IRQS_ENABLED(); -} - -static void __memcg_stats_lock(void) -{ - preempt_disable_nested(); -} - -static void memcg_stats_unlock(void) -{ - preempt_enable_nested(); -} - -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) -{ - unsigned int x; - - if (!val) - return; - - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); - - x = __this_cpu_add_return(stats_updates, abs(val)); - if (x > MEMCG_CHARGE_BATCH) { - /* - * If stats_flush_threshold exceeds the threshold - * (>num_online_cpus()), cgroup stats update will be triggered - * in __mem_cgroup_flush_stats(). Increasing this var further - * is redundant and simply adds overhead in atomic update. - */ - if (atomic_read(&stats_flush_threshold) <= num_online_cpus()) - atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); - __this_cpu_write(stats_updates, 0); - } -} - -static void do_flush_stats(void) -{ - /* - * We always flush the entire tree, so concurrent flushers can just - * skip. This avoids a thundering herd problem on the rstat global lock - * from memcg flushers (e.g. reclaim, refault, etc). - */ - if (atomic_read(&stats_flush_ongoing) || - atomic_xchg(&stats_flush_ongoing, 1)) - return; - - WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME); - - cgroup_rstat_flush(root_mem_cgroup->css.cgroup); - - atomic_set(&stats_flush_threshold, 0); - atomic_set(&stats_flush_ongoing, 0); -} - -void mem_cgroup_flush_stats(void) -{ - if (atomic_read(&stats_flush_threshold) > num_online_cpus()) - do_flush_stats(); -} - -void mem_cgroup_flush_stats_ratelimited(void) -{ - if (time_after64(jiffies_64, READ_ONCE(flush_next_time))) - mem_cgroup_flush_stats(); -} - -static void flush_memcg_stats_dwork(struct work_struct *w) -{ - /* - * Always flush here so that flushing in latency-sensitive paths is - * as cheap as possible. - */ - do_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); -} - /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { PGPGIN, @@ -767,6 +658,9 @@ struct memcg_vmstats_percpu { /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; + + /* Stats updates since the last flush */ + unsigned int stats_updates; KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) @@ -789,6 +683,9 @@ struct memcg_vmstats { /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; unsigned long events_pending[NR_MEMCG_EVENTS]; + + /* Stats updates since the last flush */ + atomic64_t stats_updates; KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) @@ -799,6 +696,129 @@ struct memcg_vmstats { KABI_RESERVE(8) }; +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization + * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static u64 flush_last_time; + +#define FLUSH_TIME (2UL*HZ) + +/* + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can + * not rely on this as part of an acquired spinlock_t lock. These functions are + * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion + * is sufficient. + */ +static void memcg_stats_lock(void) +{ + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); +} + +static void __memcg_stats_lock(void) +{ + preempt_disable_nested(); +} + +static void memcg_stats_unlock(void) +{ + preempt_enable_nested(); +} + + +static bool memcg_should_flush_stats(struct mem_cgroup *memcg) +{ + return atomic64_read(&memcg->vmstats->stats_updates) > + MEMCG_CHARGE_BATCH * num_online_cpus(); +} + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +{ + int cpu = smp_processor_id(); + unsigned int x; + + if (!val) + return; + + cgroup_rstat_updated(memcg->css.cgroup, cpu); + + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates, + abs(val)); + + if (x < MEMCG_CHARGE_BATCH) + continue; + + /* + * If @memcg is already flush-able, increasing stats_updates is + * redundant. Avoid the overhead of the atomic update. + */ + if (!memcg_should_flush_stats(memcg)) + atomic64_add(x, &memcg->vmstats->stats_updates); + __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0); + } +} + +static void do_flush_stats(struct mem_cgroup *memcg) +{ + if (mem_cgroup_is_root(memcg)) + WRITE_ONCE(flush_last_time, jiffies_64); + + cgroup_rstat_flush(memcg->css.cgroup); +} + +/* + * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree + * @memcg: root of the subtree to flush + * + * Flushing is serialized by the underlying global rstat lock. There is also a + * minimum amount of work to be done even if there are no stat updates to flush. + * Hence, we only flush the stats if the updates delta exceeds a threshold. This + * avoids unnecessary work and contention on the underlying lock. + */ +void mem_cgroup_flush_stats(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return; + + if (!memcg) + memcg = root_mem_cgroup; + + if (memcg_should_flush_stats(memcg)) + do_flush_stats(memcg); +} + +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) +{ + /* Only flush if the periodic flusher is one full cycle late */ + if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME)) + mem_cgroup_flush_stats(memcg); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + /* + * Deliberately ignore memcg_should_flush_stats() here so that flushing + * in latency-sensitive paths is as cheap as possible. + */ + do_flush_stats(root_mem_cgroup); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); +} + unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { long x = READ_ONCE(memcg->vmstats->state[idx]); @@ -1649,7 +1669,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) * * Current memory state: */ - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -4802,7 +4822,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -4883,7 +4903,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -5391,7 +5411,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -6864,6 +6884,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } } } + statc->stats_updates = 0; + /* We are in a per-cpu loop here, only do the atomic write once */ + if (atomic64_read(&memcg->vmstats->stats_updates)) + atomic64_set(&memcg->vmstats->stats_updates, 0); } #ifdef CONFIG_MMU @@ -7922,7 +7946,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; @@ -9279,7 +9303,11 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) break; } - cgroup_rstat_flush(memcg->css.cgroup); + /* + * mem_cgroup_flush_stats() ignores small changes. Use + * do_flush_stats() directly to get accurate stats for charging. + */ + do_flush_stats(memcg); pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; if (pages < max) continue; @@ -9344,8 +9372,10 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) static u64 zswap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - cgroup_rstat_flush(css->cgroup); - return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_flush_stats(memcg); + return memcg_page_state(memcg, MEMCG_ZSWAP_B); } static int zswap_max_show(struct seq_file *m, void *v) diff --git a/mm/vmscan.c b/mm/vmscan.c index 34614bb7062dbf5df61248085b5bdef2cd6c78ff..3f08468c46ed1564d167bb22974aa6a3d4065a86 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2949,7 +2949,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) * Flush the memory cgroup stats, so that we read accurate per-memcg * lruvec stats for heuristics. */ - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(sc->target_mem_cgroup); /* * Determine the scan balance between anon and file LRUs. diff --git a/mm/workingset.c b/mm/workingset.c index 9110957bec5b30ec41b51b722c1d63f452c50dc3..7bac9be1b87f7b1d6ef13095d67345b3386e89a4 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -425,8 +425,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) struct pglist_data *pgdat; unsigned long eviction; - if (lru_gen_enabled()) - return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset); + rcu_read_lock(); + + if (lru_gen_enabled()) { + bool recent = lru_gen_test_recent(shadow, file, + &eviction_lruvec, &eviction, workingset); + + rcu_read_unlock(); + return recent; + } + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; @@ -448,8 +456,20 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !eviction_memcg) + if (!mem_cgroup_disabled() && + (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { + rcu_read_unlock(); return false; + } + + rcu_read_unlock(); + + /* + * Flush stats (and potentially sleep) outside the RCU read section. + * XXX: With per-memcg flushing and thresholding, is ratelimiting + * still needed here? + */ + mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -493,6 +513,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) } } + mem_cgroup_put(eviction_memcg); return refault_distance <= workingset_size; } @@ -519,19 +540,16 @@ void workingset_refault(struct folio *folio, void *shadow) return; } - /* Flush stats (and potentially sleep) before holding RCU read lock */ - mem_cgroup_flush_stats_ratelimited(); - - rcu_read_lock(); - /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that - * is actually experiencing the refault event. + * is actually experiencing the refault event. Make sure the folio is + * locked to guarantee folio_memcg() stability throughout. */ + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); @@ -540,7 +558,7 @@ void workingset_refault(struct folio *folio, void *shadow) mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset)) - goto out; + return; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); @@ -556,8 +574,6 @@ void workingset_refault(struct folio *folio, void *shadow) lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } -out: - rcu_read_unlock(); } /** @@ -664,7 +680,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; - mem_cgroup_flush_stats_ratelimited(); + mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec,