diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 66ae60dead2eec355f1f3c58a6ebc4330ae2bd89..66836f91f7747f5b894e0d75bf01c4d833f47841 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -83,6 +83,10 @@ Brief summary of control files.
 				     This knob is deprecated and shouldn't be
 				     used.
  memory.force_empty		     trigger forced page reclaim
+ memory.force_swapin		     trigger forced swap-in of anonymous pages
+ memory.swap.max		     set/show limit for the difference between
+				     memsw.usage and memory.usage, i.e. swap usage
+ memory.swapfile		     set/show the swap file the cgroup may use
  memory.pressure_level	     set memory pressure notifications
  memory.swappiness		     set/show swappiness parameter of vmscan
 				     (See sysctl's vm.swappiness)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index b26b5274eaaf140ed8ccb617df2eca53a166e8bd..84cbbeaf0d78c176e79721d4cdc2afb08dcc5afb 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1247,15 +1247,18 @@ PAGE_SIZE multiple when read back.
 	  target cgroup.
 
 	This file accepts a single key, the number of bytes to reclaim.
-	No nested keys are currently supported.
 
 	Example::
 
 	  echo "1G" > memory.reclaim
 
-	The interface can be later extended with nested keys to
-	configure the reclaim behavior. For example, specify the
-	type of memory to reclaim from (anon, file, ..).
+	This file also accepts a nested key: the number of bytes to
+	reclaim, followed by the type of memory to reclaim.
+
+	Example::
+
+	  echo "1G type=file" > memory.reclaim
+	  echo "1G type=anon" > memory.reclaim
 
 	Please note that the kernel can over or under reclaim from
 	the target cgroup. If less bytes are reclaimed than the
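A short usage sketch of the extended interface (hedged: /sys/fs/cgroup/test
is a hypothetical cgroup path, and "type=anon" can only make progress when
swap is available to the cgroup):

  # reclaim up to 512M of page cache only
  echo "512M type=file" > /sys/fs/cgroup/test/memory.reclaim
  # reclaim up to 512M of anonymous memory only (swapped out)
  echo "512M type=anon" > /sys/fs/cgroup/test/memory.reclaim
  # without a type key, both anon and file pages remain eligible
  echo "1G" > /sys/fs/cgroup/test/memory.reclaim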
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 0f0d9b3d6367c69ce1eb195697cd17ca5dd548d0..9260a93adead80b1f943c0882533248f6a9dfdc6 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -161,6 +161,7 @@ CONFIG_MEMCG=y
 CONFIG_MEMCG_V1_RECLAIM=y
 CONFIG_MEMCG_MEMFS_INFO=y
 CONFIG_MEMCG_OOM_PRIORITY=y
+CONFIG_MEMCG_SWAP_QOS=y
 CONFIG_MEMCG_KMEM=y
 CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_WRITEBACK=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 504de984c7be769c71bdbdbe28e2321e7e16512e..7e893f18129e87e41ee6e5c283c2a0e3bb31b88d 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -182,6 +182,7 @@ CONFIG_MEMCG=y
 CONFIG_MEMCG_V1_RECLAIM=y
 CONFIG_MEMCG_MEMFS_INFO=y
 CONFIG_MEMCG_OOM_PRIORITY=y
+CONFIG_MEMCG_SWAP_QOS=y
 CONFIG_MEMCG_KMEM=y
 CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_WRITEBACK=y
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 31aff8b9286a014cd0ec4a06389c0b14e368d21d..287d130ee9690d903fcd09de8fa5b43ae2430937 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -54,6 +54,11 @@ enum memcg_memory_event {
 	MEMCG_NR_MEMORY_EVENTS,
 };
 
+enum {
+	SWAP_TYPE_ALL = -1,	/* may use any swap file */
+	SWAP_TYPE_NONE = -2,	/* may not use any swap file */
+};
+
 struct mem_cgroup_reclaim_cookie {
 	pg_data_t *pgdat;
 	unsigned int generation;
@@ -201,6 +206,11 @@ struct obj_cgroup {
 	};
 };
 
+struct swap_device {
+	unsigned long max;
+	int type;
+};
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -350,6 +360,10 @@ struct mem_cgroup {
 	bool high_async_reclaim;
 #endif
 
+#ifdef CONFIG_MEMCG_SWAP_QOS
+	struct swap_device *swap_dev;
+#endif
+
 	struct mem_cgroup_per_node *nodeinfo[];
 };
 
@@ -367,6 +381,15 @@ static inline void memcg_print_bad_task(struct oom_control *oc)
 {
 }
 #endif
+
+#ifdef CONFIG_MEMCG_SWAP_QOS
+DECLARE_STATIC_KEY_FALSE(memcg_swap_qos_key);
+
+#define MEMCG_SWAP_STAT_DISABLE	0
+#define MEMCG_SWAP_STAT_ALL	1
+#define MEMCG_SWAP_STAT_NONE	2
+#define MAX_MEMCG_SWAP_TYPE	MEMCG_SWAP_STAT_NONE
+#endif
+
 /*
  * size of first charge trial.
  * TODO: maybe necessary to use big numbers in big irons or dynamic based of the
@@ -1191,6 +1214,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
+int memcg_get_swap_type(struct folio *folio);
+void memcg_remove_swapfile(int type);
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1627,6 +1653,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 static inline void memcg_print_bad_task(struct oom_control *oc)
 {
 }
+
+static inline int memcg_get_swap_type(struct folio *folio)
+{
+	return SWAP_TYPE_ALL;
+}
+
+static inline void memcg_remove_swapfile(int type)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bacc4da324c38cadaa3cc9034071b5687cc694..f078aa6b493cf57ae1e7a8237163b8aa59ac3bfd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3313,6 +3313,10 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t,
 		     struct list_head *uf);
 extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
 		      int behavior);
+#ifdef CONFIG_MEMCG_SWAP_QOS
+extern void force_swapin_vma(struct vm_area_struct *vma);
+#endif
+
 #ifdef CONFIG_MMU
 extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 			 unsigned long start, unsigned long end,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index fe20c462fecb0fbfdd6c461222ca7a3ea5aa70bd..9dc160d6fd43b892a86516d008c1f72474108422 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -419,6 +419,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #define MEMCG_RECLAIM_MAY_SWAP		(1 << 1)
 #define MEMCG_RECLAIM_PROACTIVE	(1 << 2)
+#define MEMCG_RECLAIM_NOT_FILE		(1 << 3)
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
@@ -487,7 +488,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio);
 bool folio_free_swap(struct folio *folio);
 void put_swap_folio(struct folio *folio, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size,
+			  int type);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
@@ -515,6 +517,14 @@ static inline void put_swap_device(struct swap_info_struct *si)
 	percpu_ref_put(&si->users);
 }
 
+#ifdef CONFIG_MEMCG_SWAP_QOS
+extern int write_swapfile_for_memcg(struct address_space *mapping,
+				    int *swap_type);
+extern void read_swapfile_for_memcg(struct seq_file *m, int type);
+extern long get_nr_swap_pages_type(int type);
+void enable_swap_slots_cache_max(void);
+#endif
+
 #else /* CONFIG_SWAP */
 static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
 {
diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
index 15adfb8c813a01ce324c1b5e809c01db591774c2..77521ac11dca8116cd5e710e5eea96dd76b50a59 100644
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -23,7 +23,7 @@ struct swap_slots_cache {
 
 void disable_swap_slots_cache_lock(void);
 void reenable_swap_slots_cache_unlock(void);
-void enable_swap_slots_cache(void);
+void enable_swap_slots_cache(int type);
 void free_swap_slot(swp_entry_t entry);
 
 extern bool swap_slot_cache_enabled;
diff --git a/init/Kconfig b/init/Kconfig
index fad38e3594287a06e556fada10d315804b71eb9e..7e4d606b134a97af365fc15a8a3b9398c78bb5ab 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -973,6 +973,15 @@ config MEMCG_OOM_PRIORITY
 
 	  If unsure, say "n".
 
+config MEMCG_SWAP_QOS
+	bool "Enable Memory Cgroup Swap Control"
+	depends on MEMCG && SWAP
+	depends on X86 || ARM64
+	default n
+	help
+	  Memcg swap control includes forced swap-in of anonymous pages,
+	  per-cgroup swap file selection and a per-cgroup swap limit.
+
 config MEMCG_KMEM
 	bool
 	depends on MEMCG
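For orientation, a sketch of how the control is meant to be switched on at
runtime once the kernel is built with CONFIG_MEMCG_SWAP_QOS=y. The values
follow MEMCG_SWAP_STAT_* and are wired up to the vm.memcg_swap_qos_enable
sysctl added in mm/memcontrol.c below:

  # 1: enable; existing memcgs are reset to swap.max=max, swapfile=all
  sysctl -w vm.memcg_swap_qos_enable=1
  # 2: enable with swap prohibited by default (swapfile=none)
  sysctl -w vm.memcg_swap_qos_enable=2
  # 0: disable again; switching 1 <-> 2 directly is rejected (-EINVAL)
  sysctl -w vm.memcg_swap_qos_enable=0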
diff --git a/mm/madvise.c b/mm/madvise.c
index 4dded5d27e7eaa7c109663059d6077211335e542..2d56815daff246c2c43494fb940a0249f21a7a46 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -185,6 +185,11 @@ static int madvise_update_vma(struct vm_area_struct *vma,
 	return 0;
 }
 
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+{
+	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
+}
+
 #ifdef CONFIG_SWAP
 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 		unsigned long end, struct mm_walk *walk)
@@ -273,6 +278,28 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
 }
 #endif	/* CONFIG_SWAP */
 
+#ifdef CONFIG_MEMCG_SWAP_QOS
+void force_swapin_vma(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+
+	if (!can_madv_lru_vma(vma))
+		return;
+
+	if (!file) {
+		walk_page_vma(vma, &swapin_walk_ops, vma);
+		lru_add_drain();	/* push any batched pages to the LRU */
+	} else if (shmem_mapping(file->f_mapping)) {
+		shmem_swapin_range(vma, vma->vm_start,
+				   vma->vm_end, file->f_mapping);
+	}
+}
+#else
+void force_swapin_vma(struct vm_area_struct *vma)
+{
+}
+#endif
+
 /*
  * Schedule all required I/O operations. Do not wait for completion.
  */
@@ -555,11 +582,6 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
 	tlb_end_vma(tlb, vma);
 }
 
-static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
-{
-	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
-}
-
 static long madvise_cold(struct vm_area_struct *vma,
 			struct vm_area_struct **prev,
 			unsigned long start_addr, unsigned long end_addr)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2e80504a49c044585caf9c80c4a6c9cd78e541d7..1b79803e4b0ce95df243974850cee1971d028cfd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -65,6 +65,12 @@
 #include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
+#include <linux/parser.h>
+
+#ifdef CONFIG_MEMCG_SWAP_QOS
+#include <linux/blkdev.h>
+#endif
+
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -4202,6 +4208,7 @@ void memcg_print_bad_task(struct oom_control *oc)
 	}
 }
 
+#ifdef CONFIG_SYSCTL
 static void memcg_oom_prio_reset(void)
 {
 	struct mem_cgroup *iter;
@@ -4231,34 +4238,346 @@ static int sysctl_memcg_oom_prio_handler(struct ctl_table *table, int write,
 
 	return ret;
 }
+#endif
+#endif
 
-static struct ctl_table memcg_oom_prio_sysctls[] = {
-	{
-		/*
-		 * This sysctl is used to control memcg oom priority
-		 * feature, the sysctl name is for compatibility.
-		 */
-		.procname = "memcg_qos_enable",
-		.data = &sysctl_memcg_oom_prio,
-		.maxlen = sizeof(int),
-		.mode = 0644,
-		.proc_handler = sysctl_memcg_oom_prio_handler,
-		.extra1 = SYSCTL_ZERO,
-		.extra2 = SYSCTL_ONE,
-	},
-};
+#ifdef CONFIG_MEMCG_SWAP_QOS
+DEFINE_STATIC_KEY_FALSE(memcg_swap_qos_key);
+
+#ifdef CONFIG_SYSCTL
+static int sysctl_memcg_swap_qos_stat;
+static int swap_qos_type_max = MAX_MEMCG_SWAP_TYPE;
+
+static void memcg_swap_qos_reset(int type)
+{
+	struct mem_cgroup *memcg;
+
+	for_each_mem_cgroup(memcg) {
+		WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX);
+		WRITE_ONCE(memcg->swap_dev->type, type);
+	}
+}
+
+static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+	int qos_stat_old = sysctl_memcg_swap_qos_stat;
+	int swap_type;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		return ret;
+
+	if (write) {
+		if (qos_stat_old == sysctl_memcg_swap_qos_stat)
+			return 0;
+
+		switch (sysctl_memcg_swap_qos_stat) {
+		case MEMCG_SWAP_STAT_DISABLE:
+			static_branch_disable(&memcg_swap_qos_key);
+			return 0;
+		case MEMCG_SWAP_STAT_ALL:
+			swap_type = SWAP_TYPE_ALL;
+			break;
+		case MEMCG_SWAP_STAT_NONE:
+			swap_type = SWAP_TYPE_NONE;
+			break;
+		}
+
+		if (qos_stat_old == MEMCG_SWAP_STAT_DISABLE) {
+			memcg_swap_qos_reset(swap_type);
+			static_branch_enable(&memcg_swap_qos_key);
+			enable_swap_slots_cache_max();
+		} else {
+			/* switching between "all" and "none" needs a disable first */
+			sysctl_memcg_swap_qos_stat = qos_stat_old;
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+#endif
 
-static __init int memcg_oom_prio_sysctls_init(void)
+static int mem_cgroup_task_swapin(struct task_struct *task, void *arg)
 {
-	register_sysctl_init("vm", memcg_oom_prio_sysctls);
+	struct mm_struct *mm = task->mm;
+	struct vm_area_struct *vma;
+	struct blk_plug plug;
+	VMA_ITERATOR(vmi, mm, 0);
+
+	if (__task_is_dying(task))
+		return 0;
+	if (!mm || !mmget_not_zero(mm))
+		return 0;
+
+	mmap_read_lock(mm);
+	blk_start_plug(&plug);
+	for_each_vma(vmi, vma)
+		force_swapin_vma(vma);
+	blk_finish_plug(&plug);
+	mmap_read_unlock(mm);
+	mmput(mm);
+
+	return 0;
+}
+
+static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	mem_cgroup_scan_tasks(memcg, mem_cgroup_task_swapin, NULL);
+
+	return nbytes;
+}
+
+static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
+{
+	memcg->swap_dev = kmalloc(sizeof(struct swap_device), GFP_KERNEL);
+	if (!memcg->swap_dev)
+		return -ENOMEM;
 	return 0;
 }
+
+static void memcg_free_swap_device(struct mem_cgroup *memcg)
+{
+	if (!memcg->swap_dev)
+		return;
+
+	kfree(memcg->swap_dev);
+	memcg->swap_dev = NULL;
+}
+
+static void memcg_swap_device_init(struct mem_cgroup *memcg,
+				   struct mem_cgroup *parent)
+{
+	if (!static_branch_likely(&memcg_swap_qos_key) || !parent) {
+		WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX);
+		WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL);
+	} else {
+		WRITE_ONCE(memcg->swap_dev->max,
+			   READ_ONCE(parent->swap_dev->max));
+		WRITE_ONCE(memcg->swap_dev->type,
+			   READ_ONCE(parent->swap_dev->type));
+	}
+}
+
+static u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return PAGE_COUNTER_MAX * PAGE_SIZE;
+
+	return READ_ONCE(memcg->swap_dev->max) * PAGE_SIZE;
+}
+
+static ssize_t memcg_swapmax_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long max;
+	int err;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return -EACCES;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &max);
+	if (err)
+		return err;
+
+	WRITE_ONCE(memcg->swap_dev->max, max);
+
+	return nbytes;
+}
+
+static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry)
+{
+	struct mem_cgroup *memcg, *target_memcg;
+	unsigned long swap_usage;
+	unsigned long swap_limit;
+	long nr_swap_pages = PAGE_COUNTER_MAX;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return 0;
+
+	if (!entry.val)
+		return 0;
+
+	rcu_read_lock();
+	target_memcg = folio_memcg(folio);
+	if (!target_memcg || mem_cgroup_is_root(target_memcg) ||
+	    !css_tryget_online(&target_memcg->css)) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	rcu_read_unlock();
+
+	for (memcg = target_memcg; memcg != root_mem_cgroup;
+	     memcg = parent_mem_cgroup(memcg)) {
+		swap_limit = READ_ONCE(memcg->swap_dev->max);
+		swap_usage = page_counter_read(&memcg->memsw) -
+			     page_counter_read(&memcg->memory);
+		nr_swap_pages = min_t(long, nr_swap_pages,
+				      swap_limit - swap_usage);
+	}
+	css_put(&target_memcg->css);
+
+	if (folio_nr_pages(folio) > nr_swap_pages)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int memcg_swapfile_read(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	int type;
+
+	if (!static_branch_likely(&memcg_swap_qos_key)) {
+		seq_puts(m, "all\n");
+		return 0;
+	}
+
+	type = READ_ONCE(memcg->swap_dev->type);
+	if (type == SWAP_TYPE_NONE)
+		seq_puts(m, "none\n");
+	else if (type == SWAP_TYPE_ALL)
+		seq_puts(m, "all\n");
+	else
+		read_swapfile_for_memcg(m, type);
+	return 0;
+}
+
+static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf,
+				    size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct filename *pathname;
+	struct file *swapfile;
+	int ret;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return -EACCES;
+
+	buf = strstrip(buf);
+
+	if (!strcmp(buf, "none")) {
+		WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE);
+		return nbytes;
+	} else if (!strcmp(buf, "all")) {
+		WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL);
+		return nbytes;
+	}
+
+	pathname = getname_kernel(buf);
+	if (IS_ERR(pathname))
+		return PTR_ERR(pathname);
+
+	swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(swapfile)) {
+		putname(pathname);
+		return PTR_ERR(swapfile);
+	}
+	ret = write_swapfile_for_memcg(swapfile->f_mapping,
+				       &memcg->swap_dev->type);
+	filp_close(swapfile, NULL);
+	putname(pathname);
+
+	return ret < 0 ? ret : nbytes;
+}
+
+int memcg_get_swap_type(struct folio *folio)
+{
+	struct mem_cgroup *memcg;
+	int type;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return SWAP_TYPE_ALL;
+
+	if (!folio)
+		return SWAP_TYPE_ALL;
+
+	rcu_read_lock();
+	memcg = folio_memcg(folio);
+	if (!memcg || mem_cgroup_is_root(memcg) ||
+	    !css_tryget_online(&memcg->css)) {
+		rcu_read_unlock();
+		return SWAP_TYPE_ALL;
+	}
+	rcu_read_unlock();
+
+	type = READ_ONCE(memcg->swap_dev->type);
+	css_put(&memcg->css);
+	return type;
+}
+
+void memcg_remove_swapfile(int type)
+{
+	struct mem_cgroup *memcg;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return;
+
+	for_each_mem_cgroup(memcg)
+		if (READ_ONCE(memcg->swap_dev->type) == type)
+			WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE);
+}
+
+static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg)
+{
+	int type;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return mem_cgroup_get_nr_swap_pages(memcg);
+
+	type = READ_ONCE(memcg->swap_dev->type);
+	if (type == SWAP_TYPE_ALL)
+		return mem_cgroup_get_nr_swap_pages(memcg);
+	else if (type == SWAP_TYPE_NONE)
+		return 0;
+	else
+		return get_nr_swap_pages_type(type);
+}
+
 #else
-static inline int memcg_oom_prio_sysctls_init(void)
+static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
 {
 	return 0;
 }
+
+static void memcg_free_swap_device(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_swap_device_init(struct mem_cgroup *memcg,
+				   struct mem_cgroup *parent)
+{
+}
+
+static int mem_cgroup_check_swap_for_v1(struct folio *folio, swp_entry_t entry)
+{
+	return 0;
+}
+
+int memcg_get_swap_type(struct folio *folio)
+{
+	return SWAP_TYPE_ALL;
+}
+
+void memcg_remove_swapfile(int type)
+{
+}
+
+static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg)
+{
+	return mem_cgroup_get_nr_swap_pages(memcg);
+}
+
 #endif
 
 #ifdef CONFIG_NUMA
@@ -5750,6 +6069,25 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.write = memory_ksm_write,
 		.seq_show = memory_ksm_show,
 	},
+#endif
+#ifdef CONFIG_MEMCG_SWAP_QOS
+	{
+		.name = "force_swapin",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memory_swapin,
+	},
+	{
+		.name = "swap.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memcg_swapmax_write,
+		.read_u64 = memcg_swapmax_read,
+	},
+	{
+		.name = "swapfile",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memcg_swapfile_write,
+		.seq_show = memcg_swapfile_read,
+	},
 #endif
 	{ },	/* terminate */
 };
@@ -5886,6 +6224,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_node_info(memcg, node);
 	kfree(memcg->vmstats);
 	free_percpu(memcg->vmstats_percpu);
+	memcg_free_swap_device(memcg);
 	kfree(memcg);
 }
 
@@ -5907,6 +6246,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (!memcg)
 		return ERR_PTR(error);
 
+	if (memcg_alloc_swap_device(memcg))
+		goto fail;
+
 	memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1,
 				 MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
 	if (memcg->id.id < 0) {
@@ -5990,12 +6332,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		page_counter_init(&memcg->swap, &parent->swap);
 		page_counter_init(&memcg->kmem, &parent->kmem);
 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+		memcg_swap_device_init(memcg, parent);
 	} else {
 		init_memcg_events();
 		page_counter_init(&memcg->memory, NULL);
 		page_counter_init(&memcg->swap, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
+		memcg_swap_device_init(memcg, NULL);
 
 		root_mem_cgroup = memcg;
 		return &memcg->css;
@@ -7308,6 +7652,62 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 
 	return nbytes;
 }
 
+enum {
+	MEMORY_RECLAIM_TYPE = 0,
+	MEMORY_RECLAIM_NULL,
+};
+
+static const match_table_t tokens = {
+	{ MEMORY_RECLAIM_TYPE, "type=%s"},
+	{ MEMORY_RECLAIM_NULL, NULL },
+};
+
+#define RECLAIM_TYPE_SIZE 8
+
+static int reclaim_param_parse(char *buf, unsigned long *nr_pages,
+			       unsigned int *reclaim_options)
+{
+	char *old_buf, *start;
+	char type[RECLAIM_TYPE_SIZE];
+	substring_t args[MAX_OPT_ARGS];
+	u64 bytes;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, "")) {
+		*nr_pages = PAGE_COUNTER_MAX;
+		return 0;
+	}
+
+	old_buf = buf;
+	bytes = memparse(buf, &buf);
+	if (buf == old_buf)
+		return -EINVAL;
+
+	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
+
+	buf = strstrip(buf);
+	while ((start = strsep(&buf, " ")) != NULL) {
+		if (!strlen(start))
+			continue;
+
+		switch (match_token(start, tokens, args)) {
+		case MEMORY_RECLAIM_TYPE:
+			match_strlcpy(type, &args[0], RECLAIM_TYPE_SIZE);
+			if (!strcmp(type, "anon"))
+				*reclaim_options |= MEMCG_RECLAIM_NOT_FILE;
+			else if (!strcmp(type, "file"))
+				*reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
+			else
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 			      size_t nbytes, loff_t off)
 {
@@ -7317,18 +7717,22 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	unsigned int reclaim_options;
 	int err;
 
-	buf = strstrip(buf);
-	err = page_counter_memparse(buf, "", &nr_to_reclaim);
+	reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
+	err = reclaim_param_parse(buf, &nr_to_reclaim, &reclaim_options);
 	if (err)
 		return err;
 
-	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
 	while (nr_reclaimed < nr_to_reclaim) {
 		unsigned long reclaimed;
 
 		if (signal_pending(current))
 			return -EINTR;
 
+		/* When reclaiming only anonymous pages, check swap space first. */
+		if ((reclaim_options & MEMCG_RECLAIM_NOT_FILE) &&
+		    (mem_cgroup_get_nr_swap_pages_type(memcg) <= 0))
+			return -EAGAIN;
+
 		/*
 		 * This is the final attempt, drain percpu lru caches in the
 		 * hope of introducing more evictable pages for
@@ -7970,6 +8374,51 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 	refill_stock(memcg, nr_pages);
 }
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table mem_cgroup_sysctls[] = {
+#ifdef CONFIG_MEMCG_OOM_PRIORITY
+	{
+		/*
+		 * This sysctl is used to control the memcg OOM priority
+		 * feature; the sysctl name is kept for compatibility.
+		 */
+		.procname = "memcg_qos_enable",
+		.data = &sysctl_memcg_oom_prio,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = sysctl_memcg_oom_prio_handler,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
+	},
+#endif
+#ifdef CONFIG_MEMCG_SWAP_QOS
+	{
+		.procname = "memcg_swap_qos_enable",
+		.data = &sysctl_memcg_swap_qos_stat,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = sysctl_memcg_swap_qos_handler,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = &swap_qos_type_max,
+	},
+#endif
+};
+
+static __init int mem_cgroup_sysctls_init(void)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+
+	register_sysctl_init("vm", mem_cgroup_sysctls);
+	return 0;
+}
+#else
+static __init int mem_cgroup_sysctls_init(void)
+{
+	return 0;
+}
+#endif
+
 static int __init cgroup_memory(char *s)
 {
 	char *token;
@@ -8029,7 +8478,7 @@ static int __init mem_cgroup_init(void)
 	}
 
 	mem_cgroup_memfs_info_init();
-	memcg_oom_prio_sysctls_init();
+	mem_cgroup_sysctls_init();
 
 	return 0;
 }
@@ -8139,7 +8588,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
 	unsigned short oldid;
 
 	if (do_memsw_account())
-		return 0;
+		return mem_cgroup_check_swap_for_v1(folio, entry);
 
 	memcg = folio_memcg(folio);
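For completeness, a sketch of how the new cgroup v1 files fit together
(hedged: /sys/fs/cgroup/memory/test and /dev/zram0 are hypothetical, writes
return -EACCES until vm.memcg_swap_qos_enable is set, and any write to
memory.force_swapin triggers the swap-in; the value itself is ignored):

  # restrict the cgroup to one swap device and cap its swap usage;
  # the target must already be an active swap device (see swapon)
  echo /dev/zram0 > /sys/fs/cgroup/memory/test/memory.swapfile
  echo 1G > /sys/fs/cgroup/memory/test/memory.swap.max
  # "all" restores the default, "none" forbids swapping entirely
  echo all > /sys/fs/cgroup/memory/test/memory.swapfile
  # bring the cgroup's swapped-out anonymous pages back into memory
  echo 1 > /sys/fs/cgroup/memory/test/memory.force_swapin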
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0bec1f705f8e09313e1fcdcf87568cd5bf68da38..c7781364fa50ff41b392e6af26913f3e60eaf65e 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -36,6 +36,11 @@
 #include <linux/mm.h>
 
 static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static unsigned int nr_swap_slots;
+static unsigned int max_swap_slots;
+static DEFINE_PER_CPU(struct swap_slots_cache [MAX_SWAPFILES], swp_type_slots);
+#endif
 static bool	swap_slot_cache_active;
 bool	swap_slot_cache_enabled;
 static bool	swap_slot_cache_initialized;
@@ -110,7 +115,37 @@ static bool check_cache_active(void)
 	return swap_slot_cache_active;
 }
 
-static int alloc_swap_slot_cache(unsigned int cpu)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static inline struct swap_slots_cache *get_slots_cache(int swap_type)
+{
+	if (swap_type == SWAP_TYPE_ALL)
+		return raw_cpu_ptr(&swp_slots);
+	else
+		return raw_cpu_ptr(&swp_type_slots[swap_type]);
+}
+
+static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu,
+							   int swap_type)
+{
+	if (swap_type == SWAP_TYPE_ALL)
+		return &per_cpu(swp_slots, cpu);
+	else
+		return &per_cpu(swp_type_slots, cpu)[swap_type];
+}
+#else
+static inline struct swap_slots_cache *get_slots_cache(int swap_type)
+{
+	return raw_cpu_ptr(&swp_slots);
+}
+
+static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu,
+							   int swap_type)
+{
+	return &per_cpu(swp_slots, cpu);
+}
+#endif
+
+static int alloc_swap_slot_cache_cpu_type(unsigned int cpu, int swap_type)
 {
 	struct swap_slots_cache *cache;
 	swp_entry_t *slots, *slots_ret;
@@ -133,7 +168,7 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 	}
 
 	mutex_lock(&swap_slots_cache_mutex);
-	cache = &per_cpu(swp_slots, cpu);
+	cache = get_slots_cache_cpu(cpu, swap_type);
 	if (cache->slots || cache->slots_ret) {
 		/* cache already allocated */
 		mutex_unlock(&swap_slots_cache_mutex);
@@ -165,13 +200,74 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 	return 0;
 }
 
-static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
-				  bool free_slots)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static int __alloc_swap_slot_cache_cpu(unsigned int cpu)
+{
+	int i, ret;
+
+	ret = alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_swap_slots; i++) {
+		ret = alloc_swap_slot_cache_cpu_type(cpu, i);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+
+static void alloc_swap_slot_cache_type(int type)
+{
+	unsigned int cpu;
+
+	if (type >= max_swap_slots)
+		max_swap_slots = type + 1;
+
+	if (!static_branch_likely(&memcg_swap_qos_key))
+		return;
+
+	/* serialize with cpu hotplug operations */
+	cpus_read_lock();
+	while (type >= nr_swap_slots) {
+		for_each_online_cpu(cpu)
+			alloc_swap_slot_cache_cpu_type(cpu, nr_swap_slots);
+		nr_swap_slots++;
+	}
+	cpus_read_unlock();
+}
+
+void enable_swap_slots_cache_max(void)
+{
+	mutex_lock(&swap_slots_cache_enable_mutex);
+	if (max_swap_slots)
+		alloc_swap_slot_cache_type(max_swap_slots - 1);
+	mutex_unlock(&swap_slots_cache_enable_mutex);
+}
+#else
+static inline int __alloc_swap_slot_cache_cpu(unsigned int cpu)
+{
+	return alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
+}
+
+static void alloc_swap_slot_cache_type(int type)
+{
+}
+#endif
+
+static int alloc_swap_slot_cache(unsigned int cpu)
+{
+	return __alloc_swap_slot_cache_cpu(cpu);
+}
+
+static void drain_slots_cache_cpu_type(unsigned int cpu, unsigned int type,
+				       bool free_slots, int swap_type)
 {
 	struct swap_slots_cache *cache;
 	swp_entry_t *slots = NULL;
 
-	cache = &per_cpu(swp_slots, cpu);
+	cache = get_slots_cache_cpu(cpu, swap_type);
 	if ((type & SLOTS_CACHE) && cache->slots) {
 		mutex_lock(&cache->alloc_lock);
 		swapcache_free_entries(cache->slots + cache->cur, cache->nr);
@@ -196,6 +292,30 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
 	}
 }
 
+#ifdef CONFIG_MEMCG_SWAP_QOS
+static void __drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+				    bool free_slots)
+{
+	int i;
+
+	drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
+	for (i = 0; i < nr_swap_slots; i++)
+		drain_slots_cache_cpu_type(cpu, type, free_slots, i);
+}
+#else
+static inline void __drain_slots_cache_cpu(unsigned int cpu,
+					   unsigned int type, bool free_slots)
+{
+	drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
+}
+#endif
+
+static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+				  bool free_slots)
+{
+	__drain_slots_cache_cpu(cpu, type, free_slots);
+}
+
 static void __drain_swap_slots_cache(unsigned int type)
 {
 	unsigned int cpu;
@@ -235,7 +355,7 @@ static int free_slot_cache(unsigned int cpu)
 	return 0;
 }
 
-void enable_swap_slots_cache(void)
+void enable_swap_slots_cache(int type)
 {
 	mutex_lock(&swap_slots_cache_enable_mutex);
 	if (!swap_slot_cache_initialized) {
@@ -250,13 +370,14 @@ void enable_swap_slots_cache(void)
 		swap_slot_cache_initialized = true;
 	}
 
+	alloc_swap_slot_cache_type(type);
 	__reenable_swap_slots_cache();
 out_unlock:
 	mutex_unlock(&swap_slots_cache_enable_mutex);
 }
 
 /* called with swap slot cache's alloc lock held */
-static int refill_swap_slots_cache(struct swap_slots_cache *cache)
+static int refill_swap_slots_cache(struct swap_slots_cache *cache, int type)
 {
 	if (!use_swap_slot_cache)
 		return 0;
@@ -264,7 +385,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
 	cache->cur = 0;
 	if (swap_slot_cache_active)
 		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
-					   cache->slots, 1);
+					   cache->slots, 1, type);
 
 	return cache->nr;
 }
@@ -303,12 +424,17 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
 {
 	swp_entry_t entry;
 	struct swap_slots_cache *cache;
+	int type;
 
 	entry.val = 0;
 
+	type = memcg_get_swap_type(folio);
+	if (type == SWAP_TYPE_NONE)
+		goto out;
+
 	if (folio_test_large(folio)) {
 		if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
-			get_swap_pages(1, &entry, folio_nr_pages(folio));
+			get_swap_pages(1, &entry, folio_nr_pages(folio), type);
 		goto out;
 	}
 
@@ -321,7 +447,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
 	 * The alloc path here does not touch cache->slots_ret
 	 * so cache->free_lock is not taken.
 	 */
-	cache = raw_cpu_ptr(&swp_slots);
+	cache = get_slots_cache(type);
 
 	if (likely(check_cache_active() && cache->slots)) {
 		mutex_lock(&cache->alloc_lock);
@@ -331,7 +457,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
 			entry = cache->slots[cache->cur];
 			cache->slots[cache->cur++].val = 0;
 			cache->nr--;
-		} else if (refill_swap_slots_cache(cache)) {
+		} else if (refill_swap_slots_cache(cache, type)) {
 			goto repeat;
 		}
 	}
@@ -340,7 +466,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
 		goto out;
 	}
 
-	get_swap_pages(1, &entry, 1);
+	get_swap_pages(1, &entry, 1, type);
 out:
 	if (mem_cgroup_try_charge_swap(folio, entry)) {
 		put_swap_folio(folio, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4bc70f459164147e260eeb3b669737b0df1a2b82..68859289f19e38795c9df1cb5dfd8fed7036c583 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1044,7 +1044,92 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 	swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
 
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+#ifdef CONFIG_MEMCG_SWAP_QOS
+int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type)
+{
+	struct swap_info_struct *si;
+	unsigned int type;
+	int ret = -EINVAL;
+
+	spin_lock(&swap_lock);
+	for (type = 0; type < nr_swapfiles; type++) {
+		si = swap_info[type];
+		if ((si->flags & SWP_WRITEOK) &&
+		    (si->swap_file->f_mapping == mapping)) {
+			WRITE_ONCE(*swap_type, type);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock(&swap_lock);
+	return ret;
+}
+
+void read_swapfile_for_memcg(struct seq_file *m, int type)
+{
+	struct swap_info_struct *si;
+
+	spin_lock(&swap_lock);
+	if (type < nr_swapfiles) {
+		si = swap_info[type];
+		if (si->flags & SWP_WRITEOK) {
+			seq_file_path(m, si->swap_file, "\t\n\\");
+			seq_puts(m, "\n");
+		}
+	}
+	spin_unlock(&swap_lock);
+}
+
+long get_nr_swap_pages_type(int type)
+{
+	struct swap_info_struct *si;
+	long nr_swap_pages = 0;
+
+	spin_lock(&swap_lock);
+	if (type < nr_swapfiles) {
+		si = swap_info[type];
+		if (si->flags & SWP_WRITEOK)
+			nr_swap_pages = si->pages - si->inuse_pages;
+	}
+	spin_unlock(&swap_lock);
+
+	return nr_swap_pages;
+}
+
+static long get_avail_pages(unsigned long size, int type)
+{
+	long avail_pgs = 0;
+
+	if (type == SWAP_TYPE_ALL)
+		return atomic_long_read(&nr_swap_pages) / size;
+
+	spin_unlock(&swap_avail_lock);
+	avail_pgs = get_nr_swap_pages_type(type) / size;
+	spin_lock(&swap_avail_lock);
+	return avail_pgs;
+}
+
+static inline bool should_skip_swap_type(int swap_type, int type)
+{
+	if (type == SWAP_TYPE_ALL)
+		return false;
+
+	return (type != swap_type);
+}
+#else
+static inline long get_avail_pages(unsigned long size, int type)
+{
+	return atomic_long_read(&nr_swap_pages) / size;
+}
+
+static inline bool should_skip_swap_type(int swap_type, int type)
+{
+	return false;
+}
+#endif
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size,
+		   int type)
 {
 	unsigned long size = swap_entry_size(entry_size);
 	struct swap_info_struct *si, *next;
@@ -1057,7 +1142,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 
 	spin_lock(&swap_avail_lock);
 
-	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
+	avail_pgs = get_avail_pages(size, type);
 	if (avail_pgs <= 0) {
 		spin_unlock(&swap_avail_lock);
 		goto noswap;
@@ -1074,6 +1159,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
 		spin_lock(&si->lock);
+		if (should_skip_swap_type(si->type, type)) {
+			spin_unlock(&si->lock);
+			spin_lock(&swap_avail_lock);
+			goto nextsi;
+		}
 		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
 			spin_lock(&swap_avail_lock);
 			if (plist_node_empty(&si->avail_lists[node])) {
@@ -2514,6 +2604,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	p->swap_map = NULL;
 	cluster_info = p->cluster_info;
 	p->cluster_info = NULL;
+	memcg_remove_swapfile(p->type);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 	arch_swap_invalidate_area(p->type);
@@ -3234,7 +3325,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (inode)
 		inode_unlock(inode);
 	if (!error)
-		enable_swap_slots_cache();
+		enable_swap_slots_cache(p->type);
 	return error;
 }
 
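One behavioural note worth illustrating: swapoff calls
memcg_remove_swapfile(), so cgroups that were bound to the departing device
are switched to "none" rather than silently falling back to the remaining
swap devices (hedged sketch, hypothetical paths):

  echo /dev/zram0 > /sys/fs/cgroup/memory/test/memory.swapfile
  swapoff /dev/zram0
  cat /sys/fs/cgroup/memory/test/memory.swapfile
  none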
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7a676296af30666d31ba5e0efbf069d600e20e3d..6461552c81d78c65451b5c10f0b76d3a795e6301 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -109,6 +109,9 @@ struct scan_control {
 	/* Can folios be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Reclaim only anonymous folios, skipping file folios? */
+	unsigned int not_file:1;
+
 	/* Proactive reclaim invoked by userspace through memory.reclaim */
 	unsigned int proactive:1;
 
@@ -3035,6 +3038,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	unsigned long ap, fp;
 	enum lru_list lru;
 
+	if (sc->not_file) {
+		scan_balance = SCAN_ANON;
+		goto out;
+	}
+
 	/* If we have no swap space, do not bother scanning anon folios. */
 	if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
 		scan_balance = SCAN_FILE;
@@ -7141,6 +7149,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
 		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+		.not_file = !!(reclaim_options & MEMCG_RECLAIM_NOT_FILE),
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
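Finally, the reclaim side ties the pieces together: with type=anon,
memory_reclaim() consults mem_cgroup_get_nr_swap_pages_type() and fails
fast with -EAGAIN when the cgroup has no usable swap space, instead of
spinning in the reclaim loop. A hedged sketch, with $CG standing for a
hypothetical cgroup directory that exposes memory.reclaim (this tree
carries memory.reclaim for cgroup v1 via CONFIG_MEMCG_V1_RECLAIM, which
the example assumes):

  echo none > $CG/memory.swapfile
  echo "256M type=anon" > $CG/memory.reclaim
  # echo: write error: Resource temporarily unavailable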