diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 6859a50fbd0971877663228ceef1ea832b2773fa..3891916a06711c89422ae76af0b170d842261ee6 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -78,6 +78,9 @@ Brief summary of control files.
  memory.stat                       show various statistics
  memory.use_hierarchy              set/show hierarchical account enabled
  memory.force_empty                trigger forced page reclaim
+ memory.force_swapin               trigger forced swap-in of anonymous pages
+ memory.swap.max                   set/show limit of swap usage
+ memory.swapfile                   set/show the swap file available to the cgroup
  memory.pressure_level             set memory pressure notifications
  memory.swappiness                 set/show swappiness parameter of vmscan
                                    (See sysctl's vm.swappiness)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 5d9b7e552fb0e2112eebc55e94f71e0c75a09bb9..394fb18c9e3dc1cb9ebdaddf77dd8a6ad9395068 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1196,20 +1196,29 @@ PAGE_SIZE multiple when read back.
 	  target cgroup.
 
 	  This file accepts a single key, the number of bytes to reclaim.
-	  No nested keys are currently supported.
 
 	  Example::
 
 	    echo "1G" > memory.reclaim
 
-	  The interface can be later extended with nested keys to
-	  configure the reclaim behavior. For example, specify the
-	  type of memory to reclaim from (anon, file, ..).
+	  This file also accepts nested keys; the number of bytes to
+	  reclaim may be followed by the type of memory to reclaim.
+
+	  Example::
+
+	    echo "1G type=file" > memory.reclaim
 
 	  Please note that the kernel can over or under reclaim from
 	  the target cgroup. If less bytes are reclaimed than the
 	  specified amount, -EAGAIN is returned.
 
+	  Please note that proactive reclaim (triggered by this
+	  interface) is not meant to indicate memory pressure on the
+	  memory cgroup. Therefore socket memory balancing triggered by
+	  the memory reclaim normally is not exercised in this case.
+	  This means that the networking layer will not adapt based on
+	  reclaim induced by memory.reclaim.
+
   memory.oom.group
 	A read-write single value file which exists on non-root
 	cgroups.  The default value is "0".
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e1c24b0056d01d427db0e44f5f71acd78d25f9f6..16b3666393f264bdcf8d5c2d2f6d954864e7e5d9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -50,6 +50,11 @@ enum memcg_memory_event {
 	MEMCG_NR_MEMORY_EVENTS,
 };
 
+enum {
+	SWAP_TYPE_ALL = -1,	/* allowed to use all swap files */
+	SWAP_TYPE_NONE = -2,	/* prohibited from using any swap file */
+};
+
 struct mem_cgroup_reclaim_cookie {
 	pg_data_t *pgdat;
 	unsigned int generation;
@@ -240,6 +245,11 @@ struct obj_cgroup {
 	};
 };
 
+struct swap_device {
+	unsigned long max;
+	int type;
+};
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -402,7 +412,12 @@ struct mem_cgroup {
 #else
 	KABI_RESERVE(6)
 #endif
+#ifdef CONFIG_MEMCG_SWAP_QOS
+	/* per-memcg swap device control; protected by swap_lock */
+	KABI_USE(7, struct swap_device *swap_dev)
+#else
 	KABI_RESERVE(7)
+#endif
 	KABI_RESERVE(8)
 
 	struct mem_cgroup_per_node *nodeinfo[0];
@@ -424,6 +439,10 @@ extern int sysctl_memcg_qos_handler(struct ctl_table *table,
 void memcg_print_bad_task(struct oom_control *oc);
 #endif
 
+#ifdef CONFIG_MEMCG_SWAP_QOS
+DECLARE_STATIC_KEY_FALSE(memcg_swap_qos_key);
+#endif
+
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
  * TODO: maybe necessary to use big numbers in big irons.
@@ -1294,6 +1313,9 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
 
 int mem_cgroup_force_empty(struct mem_cgroup *memcg);
 
+int memcg_get_swap_type(struct page *page);
+void memcg_remove_swapfile(int type);
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1701,6 +1723,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 static inline void memcg_print_bad_task(struct oom_control *oc)
 {
 }
+
+static inline int memcg_get_swap_type(struct page *page)
+{
+	return SWAP_TYPE_ALL;
+}
+
+static inline void memcg_remove_swapfile(int type)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b5ce84212d787c334684d3dfec042ac5ce7c301..58d7a59b5b6529fae7a3d49ceba3a2ee9628a934 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2650,6 +2650,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
 		       struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
 		     struct list_head *uf);
+extern void force_swapin_vma(struct vm_area_struct *vma);
 extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
 extern unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7f49964f27d2d714269be06e9ae52205f1ba6bfb..524e3ca9d0f5b6dd244d3beaa82e940d44cfaeee 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -376,10 +376,14 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
+
+#define MEMCG_RECLAIM_MAY_SWAP		(1 << 1)
+#define MEMCG_RECLAIM_PROACTIVE		(1 << 2)
+#define MEMCG_RECLAIM_NOT_FILE		(1 << 3)
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
-						  bool may_swap);
+						  unsigned int reclaim_options);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						pg_data_t *pgdat,
@@ -507,11 +511,14 @@ static inline long get_nr_swap_pages(void)
 	return atomic_long_read(&nr_swap_pages);
 }
 
+extern long get_nr_swap_pages_type(int type);
+
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(struct page *page);
 extern void put_swap_page(struct page *page, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size,
+			  int type);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
@@ -543,6 +550,12 @@ static inline void put_swap_device(struct swap_info_struct *si)
 	percpu_ref_put(&si->sei->users);
 }
 
+#ifdef CONFIG_MEMCG_SWAP_QOS
+extern int write_swapfile_for_memcg(struct address_space *mapping,
+				    int *swap_type);
+extern void read_swapfile_for_memcg(struct seq_file *m, int type);
+void enable_swap_slots_cache_max(void);
+#endif
 #else /* CONFIG_SWAP */
 
 static inline int swap_readpage(struct page *page, bool do_poll)
diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
index 347f1a30419059a577157e3a2fd68979db294dfd..1ea720390822bd78217cbeff9a484a2aa37d7487 100644
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -23,7 +23,7 @@ struct swap_slots_cache {
 
 void disable_swap_slots_cache_lock(void);
 void reenable_swap_slots_cache_unlock(void);
-void enable_swap_slots_cache(void);
+void enable_swap_slots_cache(int type);
 int free_swap_slot(swp_entry_t entry);
 
 extern bool swap_slot_cache_enabled;
diff --git a/mm/Kconfig b/mm/Kconfig
index f66457168de968e7ca81d649efc461c2ce7357eb..0fe459e793368edb1d108d5fa2f22326a90b71ea 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -512,6 +512,15 @@ config MEMCG_QOS
 
 	  If unsure, say "n".
 
+config MEMCG_SWAP_QOS
+	bool "Enable Memory Cgroup Swap Control"
+	depends on MEMCG_SWAP
+	depends on X86 || ARM64
+	default n
+	help
+	  Memory cgroup swap control includes forced swap-in of anonymous
+	  pages, per-cgroup swap file selection and swap limits.
+
 config ETMEM_SCAN
 	tristate "module: etmem page scan for etmem support"
 	depends on ETMEM
diff --git a/mm/madvise.c b/mm/madvise.c
index 0a1d6f9d75eaa2d1c5eb47d73f82111276512fce..6028383a8147c510e99ad652f0bd9ec1848362d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -259,6 +259,25 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 }
 
+void force_swapin_vma(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+
+	if (!can_madv_lru_vma(vma))
+		return;
+
+	if (!file) {
+		walk_page_vma(vma, &swapin_walk_ops, vma);
+		lru_add_drain();
+	} else if (shmem_mapping(file->f_mapping))
+		force_shm_swapin_readahead(vma, vma->vm_start,
+					   vma->vm_end, file->f_mapping);
+}
+#else
+void force_swapin_vma(struct vm_area_struct *vma)
+{
+}
 #endif	/* CONFIG_SWAP */
 
 /*
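memory.force_swapin applies force_swapin_vma() to every VMA of every task in
the cgroup. A rough userspace analogue of the per-VMA effect, using the real
madvise(2) MADV_WILLNEED hint on a private mapping (illustrative only)::

  #include <string.h>
  #include <sys/mman.h>

  int main(void)
  {
          size_t len = 64 << 20;
          char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (buf == MAP_FAILED)
                  return 1;
          memset(buf, 0x5a, len); /* touch; may later be swapped out */

          /* prefetch the range back from swap, like force_swapin_vma() */
          if (madvise(buf, len, MADV_WILLNEED))
                  return 1;

          munmap(buf, len);
          return 0;
  }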
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 58c16f15984cc3998fd65094dee79aaf796f215b..dfe0f11271535c13aac5dd53679e4c50c9345d81 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2397,7 +2397,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
 		psi_memstall_enter(&pflags);
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
-							     gfp_mask, true);
+							     gfp_mask,
+							     MEMCG_RECLAIM_MAY_SWAP);
 		psi_memstall_leave(&pflags);
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
@@ -2660,7 +2661,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	enum oom_status oom_status;
 	unsigned long nr_reclaimed;
 	bool passed_oom = false;
-	bool may_swap = true;
+	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
 	bool drained = false;
 	unsigned long pflags;
 
@@ -2679,7 +2680,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		mem_over_limit = mem_cgroup_from_counter(counter, memory);
 	} else {
 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-		may_swap = false;
+		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
 	}
 
 	if (batch > nr_pages) {
@@ -2715,7 +2716,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-						    gfp_mask, may_swap);
+						    gfp_mask, reclaim_options);
 	psi_memstall_leave(&pflags);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3365,8 +3366,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 			continue;
 		}
 
-		if (!try_to_free_mem_cgroup_pages(memcg, 1,
-					GFP_KERNEL, !memsw)) {
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -3483,7 +3484,7 @@ int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 			return -EINTR;
 
 		progress = try_to_free_mem_cgroup_pages(memcg, 1,
-							GFP_KERNEL, true);
+							GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -4054,6 +4055,344 @@ void memcg_print_bad_task(struct oom_control *oc)
 
 #endif
 
+#ifdef CONFIG_MEMCG_SWAP_QOS
+DEFINE_STATIC_KEY_FALSE(memcg_swap_qos_key);
+
+#ifdef CONFIG_SYSCTL
+static int sysctl_memcg_swap_qos_stat;
+
+static void memcg_swap_qos_reset(void)
+{
+	struct mem_cgroup *memcg;
+
+	for_each_mem_cgroup(memcg) {
+		WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX);
+		WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL);
+	}
+}
+
+static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		if (sysctl_memcg_swap_qos_stat) {
+			memcg_swap_qos_reset();
+			static_branch_enable(&memcg_swap_qos_key);
+			enable_swap_slots_cache_max();
+		} else {
+			static_branch_disable(&memcg_swap_qos_key);
+		}
+	}
+	return 0;
+}
+
+static struct ctl_table memcg_swap_qos_sysctls[] = {
+	{
+		.procname	= "memcg_swap_qos_enable",
+		.data		= &sysctl_memcg_swap_qos_stat,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_memcg_swap_qos_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+
+static __init int memcg_swap_qos_sysctls_init(void)
+{
+	if (mem_cgroup_disabled() || cgroup_memory_noswap)
+		return 0;
+	register_sysctl_init("vm", memcg_swap_qos_sysctls);
+	return 0;
+}
+late_initcall(memcg_swap_qos_sysctls_init);
+#endif
+
+static int mem_cgroup_task_swapin(struct task_struct *task, void *arg)
+{
+	struct mm_struct *mm = task->mm;
+	struct vm_area_struct *vma;
+	struct blk_plug plug;
+
+	mmap_read_lock(mm);
+	blk_start_plug(&plug);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		force_swapin_vma(vma);
+	blk_finish_plug(&plug);
+	mmap_read_unlock(mm);
+
+	return 0;
+}
+
+static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	mem_cgroup_scan_tasks(memcg, mem_cgroup_task_swapin, NULL);
+
+	return nbytes;
+}
+
+static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
+{
+	memcg->swap_dev = kmalloc(sizeof(struct swap_device), GFP_KERNEL);
+	if (!memcg->swap_dev)
+		return -ENOMEM;
+	return 0;
+}
+
+static void memcg_free_swap_device(struct mem_cgroup *memcg)
+{
+	if (!memcg->swap_dev)
+		return;
+
+	kfree(memcg->swap_dev);
+	memcg->swap_dev = NULL;
+}
+
+static void memcg_swap_device_init(struct mem_cgroup *memcg,
+				   struct mem_cgroup *parent)
+{
+	if (!static_branch_likely(&memcg_swap_qos_key) || !parent) {
+		WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX);
+		WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL);
+	} else {
+		WRITE_ONCE(memcg->swap_dev->max,
+			   READ_ONCE(parent->swap_dev->max));
+		WRITE_ONCE(memcg->swap_dev->type,
+			   READ_ONCE(parent->swap_dev->type));
+	}
+}
+
+u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return PAGE_COUNTER_MAX * PAGE_SIZE;
+
+	return READ_ONCE(memcg->swap_dev->max) * PAGE_SIZE;
+}
+
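Usage sketch for the control plane added above. The sysctl name follows from
the ctl_table ("memcg_swap_qos_enable" under "vm"); the cgroup v1 mount point
and group name are assumptions, adjust for your system::

  #include <fcntl.h>
  #include <unistd.h>

  static int write_str(const char *path, const char *val, size_t len)
  {
          int fd = open(path, O_WRONLY);
          int ok = fd >= 0 && write(fd, val, len) == (ssize_t)len;

          if (fd >= 0)
                  close(fd);
          return ok ? 0 : -1;
  }

  int main(void)
  {
          /* enabling resets every memcg to max/all, then flips the key */
          if (write_str("/proc/sys/vm/memcg_swap_qos_enable", "1", 1))
                  return 1;
          /* per-cgroup swap cap, parsed by page_counter_memparse("max") */
          return write_str("/sys/fs/cgroup/memory/test/memory.swap.max",
                           "512M", 4);
  }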
+static ssize_t memcg_swapmax_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long max;
+	int err;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return -EACCES;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &max);
+	if (err)
+		return err;
+
+	WRITE_ONCE(memcg->swap_dev->max, max);
+
+	return nbytes;
+}
+
+static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry)
+{
+	struct mem_cgroup *memcg, *target_memcg;
+	unsigned long swap_usage;
+	unsigned long swap_limit;
+	long nr_swap_pages = PAGE_COUNTER_MAX;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return 0;
+
+	if (!entry.val)
+		return 0;
+
+	rcu_read_lock();
+	target_memcg = page_memcg(page);
+	if (!target_memcg || mem_cgroup_is_root(target_memcg)) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	if (!css_tryget_online(&target_memcg->css)) {
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+
+	for (memcg = target_memcg; memcg != root_mem_cgroup;
+	     memcg = parent_mem_cgroup(memcg)) {
+		swap_limit = READ_ONCE(memcg->swap_dev->max);
+		swap_usage = page_counter_read(&memcg->memsw) -
+			     page_counter_read(&memcg->memory);
+		nr_swap_pages = min_t(long, nr_swap_pages,
+				      swap_limit - swap_usage);
+	}
+	css_put(&target_memcg->css);
+
+	if (thp_nr_pages(page) > nr_swap_pages)
+		return -ENOMEM;
+	return 0;
+}
+
+static int memcg_swapfile_read(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	int type;
+
+	if (!static_branch_likely(&memcg_swap_qos_key)) {
+		seq_printf(m, "all\n");
+		return 0;
+	}
+
+	type = READ_ONCE(memcg->swap_dev->type);
+	if (type == SWAP_TYPE_NONE)
+		seq_printf(m, "none\n");
+	else if (type == SWAP_TYPE_ALL)
+		seq_printf(m, "all\n");
+	else
+		read_swapfile_for_memcg(m, type);
+	return 0;
+}
+
+static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf,
+				    size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct filename *pathname;
+	struct file *swapfile;
+	int ret;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return -EACCES;
+
+	buf = strstrip(buf);
+
+	if (!strcmp(buf, "none")) {
+		WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE);
+		return nbytes;
+	} else if (!strcmp(buf, "all")) {
+		WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL);
+		return nbytes;
+	}
+
+	pathname = getname_kernel(buf);
+	if (IS_ERR(pathname))
+		return PTR_ERR(pathname);
+
+	swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(swapfile)) {
+		putname(pathname);
+		return PTR_ERR(swapfile);
+	}
+	ret = write_swapfile_for_memcg(swapfile->f_mapping,
+				       &memcg->swap_dev->type);
+	filp_close(swapfile, NULL);
+	putname(pathname);
+
+	return ret < 0 ? ret : nbytes;
+}
+
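mem_cgroup_check_swap_for_v1() above computes a cgroup's swap usage as
memsw.usage - memory.usage and rejects the swap-out when the page exceeds the
smallest headroom on the path to the root. The same arithmetic as a
standalone sketch; all numbers are made up for the example::

  #include <stdio.h>

  struct level { long limit, memsw_usage, mem_usage; };

  int main(void)
  {
          struct level path[] = {         /* leaf first, in pages */
                  { .limit = 4096, .memsw_usage = 9000,  .mem_usage = 6000 },
                  { .limit = 8192, .memsw_usage = 30000, .mem_usage = 24000 },
          };
          long headroom = 1L << 30;
          int nr_pages = 512;             /* e.g. one THP */

          for (unsigned int i = 0; i < sizeof(path) / sizeof(path[0]); i++) {
                  long used = path[i].memsw_usage - path[i].mem_usage;

                  if (path[i].limit - used < headroom)
                          headroom = path[i].limit - used;
          }
          printf("headroom=%ld pages -> %s\n", headroom,
                 nr_pages > headroom ? "-ENOMEM" : "ok");
          return 0;
  }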
+int memcg_get_swap_type(struct page *page)
+{
+	struct mem_cgroup *memcg;
+	int type;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return SWAP_TYPE_ALL;
+
+	if (!page)
+		return SWAP_TYPE_ALL;
+
+	rcu_read_lock();
+	memcg = page_memcg(page);
+	if (!memcg || mem_cgroup_is_root(memcg)) {
+		rcu_read_unlock();
+		return SWAP_TYPE_ALL;
+	}
+
+	if (!css_tryget_online(&memcg->css)) {
+		rcu_read_unlock();
+		return SWAP_TYPE_ALL;
+	}
+	rcu_read_unlock();
+
+	type = READ_ONCE(memcg->swap_dev->type);
+	css_put(&memcg->css);
+	return type;
+}
+
+void memcg_remove_swapfile(int type)
+{
+	struct mem_cgroup *memcg;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return;
+
+	for_each_mem_cgroup(memcg)
+		if (READ_ONCE(memcg->swap_dev->type) == type)
+			WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE);
+}
+
+static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg)
+{
+	int type;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return mem_cgroup_get_nr_swap_pages(memcg);
+
+	type = READ_ONCE(memcg->swap_dev->type);
+	if (type == SWAP_TYPE_ALL)
+		return mem_cgroup_get_nr_swap_pages(memcg);
+	else if (type == SWAP_TYPE_NONE)
+		return 0;
+	else
+		return get_nr_swap_pages_type(type);
+}
+
+#else
+static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
+static void memcg_free_swap_device(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_swap_device_init(struct mem_cgroup *memcg,
+				   struct mem_cgroup *parent)
+{
+}
+
+static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry)
+{
+	return 0;
+}
+
+int memcg_get_swap_type(struct page *page)
+{
+	return SWAP_TYPE_ALL;
+}
+
+void memcg_remove_swapfile(int type)
+{
+}
+
+static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg)
+{
+	return mem_cgroup_get_nr_swap_pages(memcg);
+}
+
+#endif
+
 #ifdef CONFIG_NUMA
 
 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
@@ -5230,7 +5569,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 		}
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-							 GFP_KERNEL, true);
+							 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 
 		if (!reclaimed && !nr_retries--)
 			break;
@@ -5265,16 +5604,47 @@ static int memcg_events_local_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int reclaim_param_parse(char *buf, unsigned long *nr_pages,
+			       unsigned int *reclaim_options)
+{
+	char *endp;
+	u64 bytes;
+
+	if (!strcmp(buf, "")) {
+		*nr_pages = PAGE_COUNTER_MAX;
+		return 0;
+	}
+
+	bytes = memparse(buf, &endp);
+	if (*endp == ' ') {
+		buf = endp + 1;
+		buf = strim(buf);
+		if (!strcmp(buf, "type=anon"))
+			*reclaim_options |= MEMCG_RECLAIM_NOT_FILE;
+		else if (!strcmp(buf, "type=file"))
+			*reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
+		else
+			return -EINVAL;
+	} else if (*endp != '\0')
+		return -EINVAL;
+
+	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
+
+	return 0;
+}
+
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 			      size_t nbytes, loff_t off)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	unsigned int reclaim_options;
 	int err;
 
+	reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
 	buf = strstrip(buf);
-	err = page_counter_memparse(buf, "", &nr_to_reclaim);
+	err = reclaim_param_parse(buf, &nr_to_reclaim, &reclaim_options);
 	if (err)
 		return err;
 
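reclaim_param_parse() above accepts "<bytes>" alone, "<bytes> type=anon" or
"<bytes> type=file"; an empty string means reclaim as much as possible. A
userspace model of the accepted grammar (memparse()'s suffix handling is only
approximated with strtoull here; illustrative only)::

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  static int parse(const char *s, unsigned long long *bytes, const char **type)
  {
          char *end;

          *bytes = strtoull(s, &end, 0);
          switch (*end) {         /* crude stand-in for memparse() */
          case 'G': *bytes <<= 30; end++; break;
          case 'M': *bytes <<= 20; end++; break;
          case 'K': *bytes <<= 10; end++; break;
          }
          *type = "both";
          if (*end == ' ') {
                  if (!strcmp(end + 1, "type=anon"))
                          *type = "anon";   /* MEMCG_RECLAIM_NOT_FILE */
                  else if (!strcmp(end + 1, "type=file"))
                          *type = "file";   /* clears MAY_SWAP */
                  else
                          return -1;
          } else if (*end != '\0')
                  return -1;
          return 0;
  }

  int main(void)
  {
          const char *in[] = { "1G", "1G type=file", "64M type=anon", "1G x" };

          for (int i = 0; i < 4; i++) {
                  unsigned long long b;
                  const char *t;

                  printf("%-14s -> %s\n", in[i],
                         parse(in[i], &b, &t) ? "-EINVAL" : t);
          }
          return 0;
  }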
@@ -5288,6 +5658,11 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 		if (signal_pending(current))
 			return -EINTR;
 
+		/* If only reclaiming swap-backed pages, check swap space first. */
+		if ((reclaim_options & MEMCG_RECLAIM_NOT_FILE) &&
+		    (mem_cgroup_get_nr_swap_pages_type(memcg) <= 0))
+			return -EAGAIN;
+
 		/* This is the final attempt, drain percpu lru caches in the
 		 * hope of introducing more evictable pages for
 		 * try_to_free_mem_cgroup_pages().
@@ -5297,7 +5672,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg,
 						nr_to_reclaim - nr_reclaimed,
-						GFP_KERNEL, true);
+						GFP_KERNEL, reclaim_options);
 
 		if (!reclaimed && !nr_retries--)
 			return -EAGAIN;
@@ -5710,6 +6085,25 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.name = "reclaim",
 		.write = memory_reclaim,
 	},
+#ifdef CONFIG_MEMCG_SWAP_QOS
+	{
+		.name = "force_swapin",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memory_swapin,
+	},
+	{
+		.name = "swap.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memcg_swapmax_write,
+		.read_u64 = memcg_swapmax_read,
+	},
+	{
+		.name = "swapfile",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memcg_swapfile_write,
+		.seq_show = memcg_swapfile_read,
+	},
+#endif
 	{
 		.name = "high_async_ratio",
 		.flags = CFTYPE_NOT_ON_ROOT,
@@ -5854,6 +6248,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
+	memcg_free_swap_device(memcg);
 	kfree(memcg);
 }
 
@@ -5878,6 +6273,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (!memcg)
 		return ERR_PTR(error);
 
+	if (memcg_alloc_swap_device(memcg))
+		goto fail;
+
 	memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
 				 1, MEM_CGROUP_ID_MAX,
 				 GFP_KERNEL);
@@ -5955,17 +6353,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		page_counter_init(&memcg->swap, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
+		memcg_swap_device_init(memcg, NULL);
 	} else if (parent->use_hierarchy) {
 		memcg->use_hierarchy = true;
 		page_counter_init(&memcg->memory, &parent->memory);
 		page_counter_init(&memcg->swap, &parent->swap);
 		page_counter_init(&memcg->kmem, &parent->kmem);
 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+		memcg_swap_device_init(memcg, parent);
 	} else {
 		page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
 		page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
 		page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
 		page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
+		memcg_swap_device_init(memcg, root_mem_cgroup);
 		/*
 		 * Deeper hierachy with use_hierarchy == false doesn't make
 		 * much sense so let cgroup subsystem know about this
@@ -6984,7 +7385,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 		if (nr_reclaims) {
 			if (!try_to_free_mem_cgroup_pages(memcg,
 					nr_pages - max,
-					GFP_KERNEL, true))
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
 				nr_reclaims--;
 			continue;
 		}
@@ -7899,7 +8300,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 	unsigned short oldid;
 
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return 0;
+		return mem_cgroup_check_swap_for_v1(page, entry);
 
 	memcg = page_memcg(page);
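Driving the extended memory.reclaim from userspace puts the memcontrol.c
pieces together. The path below is an assumption; -EAGAIN signals partial
reclaim or, with "type=anon", missing swap space::

  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          const char *req = "256M type=anon";
          int fd = open("/sys/fs/cgroup/memory/test/memory.reclaim", O_WRONLY);

          if (fd < 0)
                  return 1;
          if (write(fd, req, strlen(req)) < 0 && errno == EAGAIN)
                  fprintf(stderr, "partial reclaim or no swap space\n");
          close(fd);
          return 0;
  }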
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0357fbe706454754bfa09867d90dc4f1ecbe5a2e..fbb9888a0469a48b19d706654479eed59893b1f8 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -35,6 +35,11 @@
 #include <linux/mm.h>
 
 static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static unsigned int nr_swap_slots;
+static unsigned int max_swap_slots;
+static DEFINE_PER_CPU(struct swap_slots_cache[MAX_SWAPFILES], swp_type_slots);
+#endif
 static bool	swap_slot_cache_active;
 bool	swap_slot_cache_enabled;
 static bool	swap_slot_cache_initialized;
@@ -111,7 +116,37 @@ static bool check_cache_active(void)
 	return swap_slot_cache_active;
 }
 
-static int alloc_swap_slot_cache(unsigned int cpu)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static inline struct swap_slots_cache *get_slots_cache(int swap_type)
+{
+	if (swap_type == SWAP_TYPE_ALL)
+		return raw_cpu_ptr(&swp_slots);
+	else
+		return raw_cpu_ptr(&swp_type_slots)[swap_type];
+}
+
+static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu,
+							   int swap_type)
+{
+	if (swap_type == SWAP_TYPE_ALL)
+		return &per_cpu(swp_slots, cpu);
+	else
+		return &per_cpu(swp_type_slots, cpu)[swap_type];
+}
+#else
+static inline struct swap_slots_cache *get_slots_cache(int swap_type)
+{
+	return raw_cpu_ptr(&swp_slots);
+}
+
+static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu,
+							   int swap_type)
+{
+	return &per_cpu(swp_slots, cpu);
+}
+#endif
+
+static int alloc_swap_slot_cache_cpu_type(unsigned int cpu, int swap_type)
 {
 	struct swap_slots_cache *cache;
 	swp_entry_t *slots, *slots_ret;
@@ -134,7 +169,7 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 	}
 
 	mutex_lock(&swap_slots_cache_mutex);
-	cache = &per_cpu(swp_slots, cpu);
+	cache = get_slots_cache_cpu(cpu, swap_type);
 	if (cache->slots || cache->slots_ret) {
 		/* cache already allocated */
 		mutex_unlock(&swap_slots_cache_mutex);
@@ -166,13 +201,74 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 	return 0;
 }
 
-static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
-				  bool free_slots)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static int __alloc_swap_slot_cache_cpu(unsigned int cpu)
+{
+	int i, ret;
+
+	ret = alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_swap_slots; i++) {
+		ret = alloc_swap_slot_cache_cpu_type(cpu, i);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+
+static void alloc_swap_slot_cache_type(int type)
+{
+	unsigned int cpu;
+
+	if (type >= max_swap_slots)
+		max_swap_slots = type + 1;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return;
+
+	/* serialize with cpu hotplug operations */
+	get_online_cpus();
+	while (type >= nr_swap_slots) {
+		for_each_online_cpu(cpu)
+			alloc_swap_slot_cache_cpu_type(cpu, nr_swap_slots);
+		nr_swap_slots++;
+	}
+	put_online_cpus();
+}
+
+void enable_swap_slots_cache_max(void)
+{
+	mutex_lock(&swap_slots_cache_enable_mutex);
+	if (max_swap_slots)
+		alloc_swap_slot_cache_type(max_swap_slots - 1);
+	mutex_unlock(&swap_slots_cache_enable_mutex);
+}
+#else
+static inline int __alloc_swap_slot_cache_cpu(unsigned int cpu)
+{
+	return alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
+}
+
+static void alloc_swap_slot_cache_type(int type)
+{
+}
+#endif
+
+static int alloc_swap_slot_cache(unsigned int cpu)
+{
+	return __alloc_swap_slot_cache_cpu(cpu);
+}
+
+static void drain_slots_cache_cpu_type(unsigned int cpu, unsigned int type,
+				       bool free_slots, int swap_type)
 {
 	struct swap_slots_cache *cache;
 	swp_entry_t *slots = NULL;
 
-	cache = &per_cpu(swp_slots, cpu);
+	cache = get_slots_cache_cpu(cpu, swap_type);
 	if ((type & SLOTS_CACHE) && cache->slots) {
 		mutex_lock(&cache->alloc_lock);
 		swapcache_free_entries(cache->slots + cache->cur, cache->nr);
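The cache layout introduced above, modelled in plain C: each CPU keeps the
existing default cache plus an array of per-swap-type caches, and
SWAP_TYPE_ALL (-1) selects the default. NCPU and the flattened arrays are
stand-ins for the kernel's per-CPU machinery::

  #include <stdio.h>

  #define MAX_SWAPFILES   32
  #define SWAP_TYPE_ALL   (-1)
  #define NCPU            4

  struct cache { int nr; };

  static struct cache swp_slots[NCPU];
  static struct cache swp_type_slots[NCPU][MAX_SWAPFILES];

  static struct cache *get_slots_cache_cpu(unsigned int cpu, int swap_type)
  {
          if (swap_type == SWAP_TYPE_ALL)
                  return &swp_slots[cpu];          /* default cache */
          return &swp_type_slots[cpu][swap_type];  /* per-type cache */
  }

  int main(void)
  {
          printf("cpu1/all   -> %p\n", (void *)get_slots_cache_cpu(1, SWAP_TYPE_ALL));
          printf("cpu1/type0 -> %p\n", (void *)get_slots_cache_cpu(1, 0));
          return 0;
  }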
@@ -198,6 +294,30 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
 	}
 }
 
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static void __drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+				    bool free_slots)
+{
+	int i;
+
+	drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
+	for (i = 0; i < nr_swap_slots; i++)
+		drain_slots_cache_cpu_type(cpu, type, free_slots, i);
+}
+#else
+static inline void __drain_slots_cache_cpu(unsigned int cpu,
+					   unsigned int type, bool free_slots)
+{
+	drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
+}
+#endif
+
+static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+				  bool free_slots)
+{
+	__drain_slots_cache_cpu(cpu, type, free_slots);
+}
+
 static void __drain_swap_slots_cache(unsigned int type)
 {
 	unsigned int cpu;
@@ -237,7 +357,7 @@ static int free_slot_cache(unsigned int cpu)
 	return 0;
 }
 
-void enable_swap_slots_cache(void)
+void enable_swap_slots_cache(int type)
 {
 	mutex_lock(&swap_slots_cache_enable_mutex);
 	if (!swap_slot_cache_initialized) {
@@ -251,14 +371,14 @@ void enable_swap_slots_cache(void)
 		swap_slot_cache_initialized = true;
 	}
-
+	alloc_swap_slot_cache_type(type);
 	__reenable_swap_slots_cache();
 out_unlock:
 	mutex_unlock(&swap_slots_cache_enable_mutex);
 }
 
 /* called with swap slot cache's alloc lock held */
-static int refill_swap_slots_cache(struct swap_slots_cache *cache)
+static int refill_swap_slots_cache(struct swap_slots_cache *cache, int type)
 {
 	if (!use_swap_slot_cache || cache->nr)
 		return 0;
@@ -266,7 +386,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
 	cache->cur = 0;
 	if (swap_slot_cache_active)
 		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
-					   cache->slots, 1);
+					   cache->slots, 1, type);
 
 	return cache->nr;
 }
@@ -307,12 +427,17 @@ swp_entry_t get_swap_page(struct page *page)
 {
 	swp_entry_t entry;
 	struct swap_slots_cache *cache;
+	int type;
 
 	entry.val = 0;
 
+	type = memcg_get_swap_type(page);
+	if (type == SWAP_TYPE_NONE)
+		goto out;
+
 	if (PageTransHuge(page)) {
 		if (IS_ENABLED(CONFIG_THP_SWAP))
-			get_swap_pages(1, &entry, HPAGE_PMD_NR);
+			get_swap_pages(1, &entry, HPAGE_PMD_NR, type);
 		goto out;
 	}
 
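The net allocation policy of the modified get_swap_page(), reduced to a pure
function; names are invented for the sketch::

  #include <stdio.h>

  #define SWAP_TYPE_ALL   (-1)
  #define SWAP_TYPE_NONE  (-2)

  static const char *alloc_decision(int memcg_type)
  {
          if (memcg_type == SWAP_TYPE_NONE)
                  return "fail: cgroup may not swap";
          if (memcg_type == SWAP_TYPE_ALL)
                  return "any swap device";
          return "only the bound swap device";
  }

  int main(void)
  {
          int types[] = { SWAP_TYPE_NONE, SWAP_TYPE_ALL, 0 };

          for (int i = 0; i < 3; i++)
                  printf("type=%d -> %s\n", types[i], alloc_decision(types[i]));
          return 0;
  }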
@@ -325,7 +450,7 @@ swp_entry_t get_swap_page(struct page *page)
 	 * The alloc path here does not touch cache->slots_ret
 	 * so cache->free_lock is not taken.
 	 */
-	cache = raw_cpu_ptr(&swp_slots);
+	cache = get_slots_cache(type);
 
 	if (likely(check_cache_active() && cache->slots)) {
 		mutex_lock(&cache->alloc_lock);
@@ -335,7 +460,7 @@ swp_entry_t get_swap_page(struct page *page)
 			entry = cache->slots[cache->cur];
 			cache->slots[cache->cur++].val = 0;
 			cache->nr--;
-		} else if (refill_swap_slots_cache(cache)) {
+		} else if (refill_swap_slots_cache(cache, type)) {
 			goto repeat;
 		}
 	}
@@ -344,7 +469,7 @@ swp_entry_t get_swap_page(struct page *page)
 		goto out;
 	}
 
-	get_swap_pages(1, &entry, 1);
+	get_swap_pages(1, &entry, 1, type);
 out:
 	if (mem_cgroup_try_charge_swap(page, entry)) {
 		put_swap_page(page, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14e2396fa8a31aa99db03cd7e21386c974e605a6..f5a15511f45ad6f3f7e57baf82874d67e9cb2e92 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1056,7 +1056,97 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 }
 
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type)
+{
+	struct swap_info_struct *si;
+	unsigned int type;
+	int ret = -EINVAL;
+
+	spin_lock(&swap_lock);
+	for (type = 0; type < nr_swapfiles; type++) {
+		si = swap_info[type];
+		if ((si->flags & SWP_WRITEOK) &&
+		    (si->swap_file->f_mapping == mapping)) {
+			WRITE_ONCE(*swap_type, type);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock(&swap_lock);
+	return ret;
+}
+
+void read_swapfile_for_memcg(struct seq_file *m, int type)
+{
+	struct swap_info_struct *si;
+
+	spin_lock(&swap_lock);
+	if (type < nr_swapfiles) {
+		si = swap_info[type];
+		if (si->flags & SWP_WRITEOK) {
+			seq_file_path(m, si->swap_file, "\t\n\\");
+			seq_printf(m, "\n");
+		}
+	}
+	spin_unlock(&swap_lock);
+}
+
+long get_nr_swap_pages_type(int type)
+{
+	struct swap_info_struct *si;
+	long nr_swap_pages = 0;
+
+	spin_lock(&swap_lock);
+	if (type < nr_swapfiles) {
+		si = swap_info[type];
+		if (si->flags & SWP_WRITEOK)
+			nr_swap_pages = si->pages - si->inuse_pages;
+	}
+	spin_unlock(&swap_lock);
+
+	return nr_swap_pages;
+}
+
+static long get_avail_pages(unsigned long size, int type)
+{
+	long avail_pgs = 0;
+
+	if (type == SWAP_TYPE_ALL)
+		return atomic_long_read(&nr_swap_pages) / size;
+
+	spin_unlock(&swap_avail_lock);
+	avail_pgs = get_nr_swap_pages_type(type) / size;
+	spin_lock(&swap_avail_lock);
+	return avail_pgs;
+}
+
+static inline bool should_skip_swap_type(int swap_type, int type)
+{
+	if (type == SWAP_TYPE_ALL)
+		return false;
+
+	return (type != swap_type);
+}
+#else
+long get_nr_swap_pages_type(int type)
+{
+	return 0;
+}
+
+static inline long get_avail_pages(unsigned long size, int type)
+{
+	return atomic_long_read(&nr_swap_pages) / size;
+}
+
+static inline bool should_skip_swap_type(int swap_type, int type)
+{
+	return false;
+}
+#endif
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size,
+		   int type)
 {
 	unsigned long size = swap_entry_size(entry_size);
 	struct swap_info_struct *si, *next;
@@ -1069,7 +1159,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 
 	spin_lock(&swap_avail_lock);
 
-	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
+	avail_pgs = get_avail_pages(size, type);
 	if (avail_pgs <= 0) {
 		spin_unlock(&swap_avail_lock);
 		goto noswap;
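Note that get_avail_pages() drops swap_avail_lock around
get_nr_swap_pages_type() because the latter takes swap_lock, avoiding nesting
the two. Inside the plist walk, should_skip_swap_type() then filters devices;
a userspace model of that filter, with an array standing in for the priority
list::

  #include <stdio.h>

  #define SWAP_TYPE_ALL   (-1)

  static int should_skip(int si_type, int requested)
  {
          return requested != SWAP_TYPE_ALL && requested != si_type;
  }

  int main(void)
  {
          int devices[] = { 0, 1, 2 };    /* si->type of each device */
          int requested = 1;              /* memcg bound to device 1 */

          for (int i = 0; i < 3; i++)
                  printf("device %d: %s\n", devices[i],
                         should_skip(devices[i], requested) ? "skip" : "try");
          return 0;
  }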
@@ -1086,6 +1176,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
 		spin_lock(&si->lock);
+		if (should_skip_swap_type(si->type, type)) {
+			spin_unlock(&si->lock);
+			spin_lock(&swap_avail_lock);
+			goto nextsi;
+		}
 		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
 			spin_lock(&swap_avail_lock);
 			if (plist_node_empty(&si->avail_lists[node])) {
@@ -2703,6 +2798,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	cluster_info = p->cluster_info;
 	p->cluster_info = NULL;
 	frontswap_map = frontswap_map_get(p);
+	memcg_remove_swapfile(p->type);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 	arch_swap_invalidate_area(p->type);
@@ -3457,7 +3553,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (inode)
 		inode_unlock(inode);
 	if (!error)
-		enable_swap_slots_cache();
+		enable_swap_slots_cache(p->type);
 	return error;
 }
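After swapoff(2) succeeds, memcg_remove_swapfile() flips every cgroup bound
to the departing device to SWAP_TYPE_NONE: the binding fails closed instead
of silently widening to all devices. Sketch of the interaction; the swapfile
path is an assumption::

  #include <stdio.h>
  #include <sys/swap.h>

  int main(void)
  {
          if (swapoff("/swapfile-a")) {
                  perror("swapoff");
                  return 1;
          }
          /* memory.swapfile of cgroups bound to /swapfile-a now reads "none" */
          puts("device removed; bound cgroups fail closed");
          return 0;
  }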
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a8412c5d4edace62f24231fcdd56a3f212ea91c1..6416737f5206bca026d9298e0d4ac9d7ffedb8b1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -103,6 +103,12 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Should skip file pages? */
+	unsigned int not_file:1;
+
+	/* Proactive reclaim invoked by userspace through memory.reclaim */
+	unsigned int proactive:1;
+
 	/*
 	 * Cgroup memory below memory.low is protected as long as we
 	 * don't threaten to OOM. If any cgroup is reclaimed at
@@ -2461,6 +2467,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	unsigned long ap, fp;
 	enum lru_list lru;
 
+	if (sc->not_file) {
+		scan_balance = SCAN_ANON;
+		goto out;
+	}
+
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
 		scan_balance = SCAN_FILE;
@@ -2880,9 +2891,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			   sc->priority);
 
 		/* Record the group's reclaim efficiency */
-		vmpressure(sc->gfp_mask, memcg, false,
-			   sc->nr_scanned - scanned,
-			   sc->nr_reclaimed - reclaimed);
+		if (!sc->proactive)
+			vmpressure(sc->gfp_mask, memcg, false,
+				   sc->nr_scanned - scanned,
+				   sc->nr_reclaimed - reclaimed);
 
 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }
@@ -3005,9 +3017,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	}
 
 	/* Record the subtree's reclaim efficiency */
-	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
-		   sc->nr_scanned - nr_scanned,
-		   sc->nr_reclaimed - nr_reclaimed);
+	if (!sc->proactive)
+		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+			   sc->nr_scanned - nr_scanned,
+			   sc->nr_reclaimed - nr_reclaimed);
 
 	if (sc->nr_reclaimed - nr_reclaimed)
 		reclaimable = true;
@@ -3252,8 +3265,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
 	do {
-		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
-				sc->priority);
+		if (!sc->proactive)
+			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+					sc->priority);
 		sc->nr_scanned = 0;
 		shrink_zones(zonelist, sc);
 
@@ -3562,7 +3576,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool may_swap)
+					   unsigned int reclaim_options)
 {
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
@@ -3575,7 +3589,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = may_swap,
+		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
+		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+		.not_file = !!(reclaim_options & MEMCG_RECLAIM_NOT_FILE),
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
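The get_scan_count() ordering added above, condensed into a pure function:
not_file wins, then the no-swap fallback to file pages, otherwise the usual
cost-based balancing (elided here). Names are invented for the sketch::

  #include <stdio.h>

  enum scan_balance { SCAN_EQUAL, SCAN_FILE, SCAN_ANON };

  static enum scan_balance pick(int not_file, int may_swap, long nr_swap_pages)
  {
          if (not_file)
                  return SCAN_ANON;       /* memory.reclaim "type=anon" */
          if (!may_swap || nr_swap_pages <= 0)
                  return SCAN_FILE;       /* no swap: don't scan anon */
          return SCAN_EQUAL;              /* balanced by recent cost */
  }

  int main(void)
  {
          printf("%d %d %d\n",
                 pick(1, 1, 100),         /* SCAN_ANON */
                 pick(0, 1, 0),           /* SCAN_FILE */
                 pick(0, 1, 100));        /* SCAN_EQUAL */
          return 0;
  }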