diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 6859a50fbd0971877663228ceef1ea832b2773fa..97bad1406a23e246f1c5e04db8072477fce9cf7b 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -78,6 +78,8 @@ Brief summary of control files. memory.stat show various statistics memory.use_hierarchy set/show hierarchical account enabled memory.force_empty trigger forced page reclaim + memory.force_swapin trigger forced swap-in of anon pages + memory.swapfile set/show swap file usable by the memcg memory.pressure_level set memory pressure notifications memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5d9b7e552fb0e2112eebc55e94f71e0c75a09bb9..a04ca490f58c81b58945ac13e2ffc34503f148e9 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1196,15 +1196,16 @@ PAGE_SIZE multiple when read back. target cgroup. This file accepts a single key, the number of bytes to reclaim. - No nested keys are currently supported. Example:: echo "1G" > memory.reclaim - The interface can be later extended with nested keys to - configure the reclaim behavior. For example, specify the - type of memory to reclaim from (anon, file, ..). + This file also accepts a nested key: the number of bytes to + reclaim may be followed by the type of memory to reclaim + (anon or file). + + Example:: + echo "1G type=file" > memory.reclaim Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4395f2e03cb7ee1b2cc7d4d438ce3b51f74c8018..c1619a0448031857f67a35ec8ff8582f156953a4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -50,6 +50,11 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, }; +enum { + SWAP_TYPE_ALL = -1, /* allowed to use any swap file */ + SWAP_TYPE_NONE = -2, /* prohibited from using any swap file */ +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; unsigned int generation; @@ -240,6 +245,11 @@ struct obj_cgroup { }; }; +struct swap_device { + int type; /* swap device (type) this memcg may use */ + unsigned long limit; /* max swap usage, in pages */ +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. 
We would eventually like to provide @@ -402,7 +412,12 @@ struct mem_cgroup { #else KABI_RESERVE(6) #endif +#ifdef CONFIG_MEMCG_SWAP_QOS + /* per-memcg swap device control; protected by swap_lock */ + KABI_USE(7, struct swap_device *swap_dev) +#else KABI_RESERVE(7) +#endif KABI_RESERVE(8) struct mem_cgroup_per_node *nodeinfo[0]; @@ -1292,6 +1307,9 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) int mem_cgroup_force_empty(struct mem_cgroup *memcg); +int memcg_get_swap_type(struct page *page); +void memcg_remove_swapfile(int type); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1695,6 +1713,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, { return 0; } + +static inline int memcg_get_swap_type(struct page *page) +{ + return SWAP_TYPE_ALL; +} + +static inline void memcg_remove_swapfile(int type) +{ +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b5ce84212d787c334684d3dfec042ac5ce7c301..58d7a59b5b6529fae7a3d49ceba3a2ee9628a934 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2650,6 +2650,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); +extern void force_swapin_vma(struct vm_area_struct *vma); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); extern unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, diff --git a/include/linux/swap.h b/include/linux/swap.h index 7f49964f27d2d714269be06e9ae52205f1ba6bfb..b40cc0500b4280508e96fe98bea1a24c83a81465 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -380,6 +380,10 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, bool may_swap); +extern unsigned long __try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + bool may_swap, bool only_swap); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, @@ -511,7 +515,8 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size, + int type); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); @@ -543,6 +548,9 @@ static inline void put_swap_device(struct swap_info_struct *si) percpu_ref_put(&si->sei->users); } +extern int write_swapfile_for_memcg(struct address_space *mapping, + int *swap_type); +extern void read_swapfile_for_memcg(struct seq_file *m, int type); #else /* CONFIG_SWAP */ static inline int swap_readpage(struct page *page, bool do_poll) diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 347f1a30419059a577157e3a2fd68979db294dfd..1ea720390822bd78217cbeff9a484a2aa37d7487 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -23,7 +23,7 @@ struct swap_slots_cache { void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); -void enable_swap_slots_cache(void); +void 
enable_swap_slots_cache(int type); int free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; diff --git a/mm/Kconfig b/mm/Kconfig index f66457168de968e7ca81d649efc461c2ce7357eb..c43c2e6b744f7c2d3af0b8cbb5afb7b8db37a730 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -512,6 +512,14 @@ config MEMCG_QOS If unsure, say "n". +config MEMCG_SWAP_QOS + bool "Enable Memory Cgroup swap QoS" + depends on MEMCG_SWAP + depends on X86 || ARM64 + default n + help + Support per-memcg swap control: selecting the swap device a memcg + may use, limiting a memcg's swap usage, and forcing swap-in of a + memcg's anonymous pages. + config ETMEM_SCAN tristate "module: etmem page scan for etmem support" depends on ETMEM diff --git a/mm/madvise.c b/mm/madvise.c index 0a1d6f9d75eaa2d1c5eb47d73f82111276512fce..6028383a8147c510e99ad652f0bd9ec1848362d2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -259,6 +259,25 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, lru_add_drain(); /* Push any new pages onto the LRU now */ } + +void force_swapin_vma(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + if (!can_madv_lru_vma(vma)) + return; + + if (!file) { + walk_page_vma(vma, &swapin_walk_ops, vma); + lru_add_drain(); + } else if (shmem_mapping(file->f_mapping)) { + force_shm_swapin_readahead(vma, vma->vm_start, + vma->vm_end, file->f_mapping); + } +} +#else +void force_swapin_vma(struct vm_area_struct *vma) +{ +} #endif /* CONFIG_SWAP */ /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 662c7859b7f152784b347776852ee9a144607329..4ee3f0fd8fc1fa6c870626641cf885707cfffe01 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5263,16 +5263,46 @@ static int memcg_events_local_show(struct seq_file *m, void *v) return 0; } +/* + * Parse "<size>[ type=anon|type=file]"; an empty string means reclaim + * as much as possible. + */ +static int reclaim_param_parse(char *buf, unsigned long *nr_pages, + bool *anon, bool *file) +{ + char *endp; + u64 bytes; + + if (!strcmp(buf, "")) { + *nr_pages = PAGE_COUNTER_MAX; + return 0; + } + + bytes = memparse(buf, &endp); + if (*endp == ' ') { + buf = endp + 1; + buf = strim(buf); + if (!strcmp(buf, "type=anon")) + *file = false; + else if (!strcmp(buf, "type=file")) + *anon = false; + else + return -EINVAL; + } else if (*endp != '\0') { + return -EINVAL; + } + + *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); + + return 0; +} + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; + bool anon = true, file = true; int err; buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); + err = reclaim_param_parse(buf, &nr_to_reclaim, &anon, &file); if (err) return err; @@ -5293,9 +5323,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (!nr_retries) lru_add_drain_all(); - reclaimed = try_to_free_mem_cgroup_pages(memcg, + reclaimed = __try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, true); + GFP_KERNEL, anon, !file); if (!reclaimed && !nr_retries--) return -EAGAIN; @@ -5306,6 +5336,197 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, return nbytes; } +#ifdef CONFIG_MEMCG_SWAP_QOS +static int mem_cgroup_task_swapin(struct task_struct *task, void *arg) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + struct blk_plug plug; + + /* Kernel threads and exiting tasks have no mm to walk. */ + if (!mm) + return 0; + + mmap_read_lock(mm); + blk_start_plug(&plug); + for (vma = mm->mmap; vma; vma = vma->vm_next) + force_swapin_vma(vma); + blk_finish_plug(&plug); + mmap_read_unlock(mm); + + return 0; +} + +static 
ssize_t memory_swapin(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + if (unlikely(mem_cgroup_is_root(memcg))) + return -EINVAL; + + mem_cgroup_scan_tasks(memcg, mem_cgroup_task_swapin, NULL); + + return nbytes; +} + +static int memcg_alloc_swap_device(struct mem_cgroup *memcg) +{ + memcg->swap_dev = kmalloc(sizeof(struct swap_device), GFP_KERNEL); + if (!memcg->swap_dev) + return -ENOMEM; + return 0; +} + +static void memcg_free_swap_device(struct mem_cgroup *memcg) +{ + /* kfree(NULL) is safe, no check needed */ + kfree(memcg->swap_dev); + memcg->swap_dev = NULL; +} + +static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct filename *pathname; + struct file *swapfile; + int ret; + + buf = strstrip(buf); + + if (!strcmp(buf, "none")) { + memcg->swap_dev->type = SWAP_TYPE_NONE; + return nbytes; + } else if (!strcmp(buf, "all")) { + memcg->swap_dev->type = SWAP_TYPE_ALL; + return nbytes; + } + + pathname = getname_kernel(buf); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + + swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swapfile)) { + putname(pathname); + return PTR_ERR(swapfile); + } + ret = write_swapfile_for_memcg(swapfile->f_mapping, + &memcg->swap_dev->type); + filp_close(swapfile, NULL); + putname(pathname); + + return ret < 0 ? ret : nbytes; +} + +static int memcg_swapfile_read(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + if (memcg->swap_dev->type == SWAP_TYPE_NONE) + seq_puts(m, "none\n"); + else if (memcg->swap_dev->type == SWAP_TYPE_ALL) + seq_puts(m, "all\n"); + else + read_swapfile_for_memcg(m, memcg->swap_dev->type); + return 0; +} + +static void memcg_copy_swap_device(struct mem_cgroup *dst, + struct mem_cgroup *src) +{ + if (!src) { + /* No parent: default to any swap device, no limit. */ + dst->swap_dev->type = SWAP_TYPE_ALL; + dst->swap_dev->limit = PAGE_COUNTER_MAX; + } else { + dst->swap_dev->type = src->swap_dev->type; + dst->swap_dev->limit = src->swap_dev->limit; + } +} + +static ssize_t memcg_swapmax_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long limit; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &limit); + if (err) + return err; + + memcg->swap_dev->limit = limit; + + return nbytes; +} + +static u64 memcg_swapmax_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return memcg->swap_dev->limit * PAGE_SIZE; +} + +int memcg_get_swap_type(struct page *page) +{ + struct mem_cgroup *memcg; + int type; + + if (mem_cgroup_disabled() || !page) + return SWAP_TYPE_ALL; + + memcg = page_memcg(page); + if (!memcg || mem_cgroup_is_root(memcg)) + return SWAP_TYPE_ALL; + + rcu_read_lock(); + if (!css_tryget_online(&memcg->css)) { + rcu_read_unlock(); + return SWAP_TYPE_ALL; + } + rcu_read_unlock(); + + type = READ_ONCE(memcg->swap_dev->type); + css_put(&memcg->css); + return type; +} + +void memcg_remove_swapfile(int type) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + for_each_mem_cgroup(memcg) + if (memcg->swap_dev->type == type) + memcg->swap_dev->type = SWAP_TYPE_NONE; +} +#else +static int memcg_alloc_swap_device(struct mem_cgroup *memcg) +{ + return 0; +} + +static void memcg_free_swap_device(struct mem_cgroup *memcg) +{ 
+} + +static void memcg_copy_swap_device(struct mem_cgroup *dst, + struct mem_cgroup *src) +{ +} + +int memcg_get_swap_type(struct page *page) +{ + return SWAP_TYPE_ALL; +} + +void memcg_remove_swapfile(int type) +{ +} +#endif + static int memcg_high_async_ratio_show(struct seq_file *m, void *v) { seq_printf(m, "%d\n", @@ -5708,6 +5929,25 @@ static struct cftype mem_cgroup_legacy_files[] = { .name = "reclaim", .write = memory_reclaim, }, +#ifdef CONFIG_MEMCG_SWAP_QOS + { + .name = "force_swapin", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memory_swapin, + }, + { + .name = "swapfile", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memcg_swapfile_write, + .seq_show = memcg_swapfile_read, + }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .write = memcg_swapmax_write, + .read_u64 = memcg_swapmax_read, + }, +#endif { .name = "high_async_ratio", .flags = CFTYPE_NOT_ON_ROOT, @@ -5852,6 +6092,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); + memcg_free_swap_device(memcg); kfree(memcg); } @@ -5876,6 +6117,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return ERR_PTR(error); + if (memcg_alloc_swap_device(memcg)) + goto fail; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX, GFP_KERNEL); @@ -5953,17 +6197,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); + memcg_copy_swap_device(memcg, NULL); } else if (parent->use_hierarchy) { memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); + memcg_copy_swap_device(memcg, parent); } else { page_counter_init(&memcg->memory, &root_mem_cgroup->memory); page_counter_init(&memcg->swap, &root_mem_cgroup->swap); page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); + memcg_copy_swap_device(memcg, root_mem_cgroup); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -7880,6 +8127,49 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) css_put(&memcg->css); } +#ifdef CONFIG_MEMCG_SWAP_QOS +static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry) +{ + struct mem_cgroup *memcg, *target_memcg; + unsigned long swap_usage; + unsigned long swap_limit; + long nr_swap_pages = PAGE_COUNTER_MAX; + + if (cgroup_memory_noswap || !entry.val) + return 0; + + target_memcg = page_memcg(page); + if (!target_memcg || mem_cgroup_is_root(target_memcg)) + return 0; + + rcu_read_lock(); + if (!css_tryget_online(&target_memcg->css)) { + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + + for (memcg = target_memcg; memcg != root_mem_cgroup; + memcg = parent_mem_cgroup(memcg)) { + swap_limit = READ_ONCE(memcg->swap_dev->limit); + swap_usage = page_counter_read(&memcg->memsw) - + page_counter_read(&memcg->memory); + nr_swap_pages = min_t(long, nr_swap_pages, + swap_limit - swap_usage); + } + css_put(&target_memcg->css); + + if (thp_nr_pages(page) > nr_swap_pages) + return -ENOMEM; + return 0; +} +#else +static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry) +{ + return 0; +} +#endif + /** * mem_cgroup_try_charge_swap - try charging swap 
space for a page * @page: page being added to swap @@ -7897,7 +8187,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) unsigned short oldid; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return 0; + return mem_cgroup_check_swap_for_v1(page, entry); memcg = page_memcg(page); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0357fbe706454754bfa09867d90dc4f1ecbe5a2e..5c37bd7841929c43b7a40e7aaa64245c3e630a67 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -35,6 +35,10 @@ #include static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); +#ifdef CONFIG_MEMCG_SWAP_QOS +/* number of per-type slot caches that have been set up so far */ +static unsigned int nr_swap_slots; +static DEFINE_PER_CPU(struct swap_slots_cache[MAX_SWAPFILES], swp_type_slots); +#endif static bool swap_slot_cache_active; bool swap_slot_cache_enabled; static bool swap_slot_cache_initialized; @@ -111,7 +115,37 @@ static bool check_cache_active(void) return swap_slot_cache_active; } -static int alloc_swap_slot_cache(unsigned int cpu) +#ifdef CONFIG_MEMCG_SWAP_QOS +static inline struct swap_slots_cache *get_slots_cache(int swap_type) +{ + if (swap_type == SWAP_TYPE_ALL) + return raw_cpu_ptr(&swp_slots); + else + return raw_cpu_ptr(&swp_type_slots[swap_type]); +} + +static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu, + int swap_type) +{ + if (swap_type == SWAP_TYPE_ALL) + return &per_cpu(swp_slots, cpu); + else + return &per_cpu(swp_type_slots, cpu)[swap_type]; +} +#else +static inline struct swap_slots_cache *get_slots_cache(int swap_type) +{ + return raw_cpu_ptr(&swp_slots); +} + +static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu, + int swap_type) +{ + return &per_cpu(swp_slots, cpu); +} +#endif + +static int alloc_swap_slot_cache_cpu_type(unsigned int cpu, int swap_type) { struct swap_slots_cache *cache; swp_entry_t *slots, *slots_ret; @@ -134,7 +168,7 @@ static int alloc_swap_slot_cache(unsigned int cpu) } mutex_lock(&swap_slots_cache_mutex); - cache = &per_cpu(swp_slots, cpu); + cache = get_slots_cache_cpu(cpu, swap_type); if (cache->slots || cache->slots_ret) { /* cache already allocated */ mutex_unlock(&swap_slots_cache_mutex); @@ -166,13 +200,60 @@ static int alloc_swap_slot_cache(unsigned int cpu) return 0; } -static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, - bool free_slots) +#ifdef CONFIG_MEMCG_SWAP_QOS +static int __alloc_swap_slot_cache_cpu(unsigned int cpu) +{ + int i, ret; + + ret = alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL); + if (ret) + return ret; + + for (i = 0; i < nr_swap_slots; i++) { + ret = alloc_swap_slot_cache_cpu_type(cpu, i); + if (ret) + return ret; + } + + return ret; +} + +static void alloc_swap_slot_cache_type(int type) +{ + unsigned int cpu; + + /* serialize with cpu hotplug operations */ + get_online_cpus(); + while (type >= nr_swap_slots) { + for_each_online_cpu(cpu) + alloc_swap_slot_cache_cpu_type(cpu, nr_swap_slots); + nr_swap_slots++; + } + put_online_cpus(); +} +#else +static inline int __alloc_swap_slot_cache_cpu(unsigned int cpu) +{ + return alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL); +} + +static void alloc_swap_slot_cache_type(int type) +{ +} +#endif + +static int alloc_swap_slot_cache(unsigned int cpu) +{ + return __alloc_swap_slot_cache_cpu(cpu); +} + +static void drain_slots_cache_cpu_type(unsigned int cpu, unsigned int type, + bool free_slots, int swap_type) { struct swap_slots_cache *cache; swp_entry_t *slots = NULL; - cache = &per_cpu(swp_slots, cpu); + cache = get_slots_cache_cpu(cpu, swap_type); if ((type & SLOTS_CACHE) 
&& cache->slots) { mutex_lock(&cache->alloc_lock); swapcache_free_entries(cache->slots + cache->cur, cache->nr); @@ -198,6 +279,30 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, } } +#ifdef CONFIG_MEMCG_SWAP_QOS +static void __drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + int i; + + drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL); + for (i = 0; i < nr_swap_slots; i++) + drain_slots_cache_cpu_type(cpu, type, free_slots, i); +} +#else +static inline void __drain_slots_cache_cpu(unsigned int cpu, + unsigned int type, bool free_slots) +{ + drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL); +} +#endif + +static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + __drain_slots_cache_cpu(cpu, type, free_slots); +} + static void __drain_swap_slots_cache(unsigned int type) { unsigned int cpu; @@ -237,7 +342,7 @@ static int free_slot_cache(unsigned int cpu) return 0; } -void enable_swap_slots_cache(void) +void enable_swap_slots_cache(int type) { mutex_lock(&swap_slots_cache_enable_mutex); if (!swap_slot_cache_initialized) { @@ -251,14 +356,14 @@ void enable_swap_slots_cache(void) swap_slot_cache_initialized = true; } - + alloc_swap_slot_cache_type(type); __reenable_swap_slots_cache(); out_unlock: mutex_unlock(&swap_slots_cache_enable_mutex); } /* called with swap slot cache's alloc lock held */ -static int refill_swap_slots_cache(struct swap_slots_cache *cache) +static int refill_swap_slots_cache(struct swap_slots_cache *cache, int type) { if (!use_swap_slot_cache || cache->nr) return 0; @@ -266,7 +371,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1); + cache->slots, 1, type); return cache->nr; } @@ -307,12 +412,17 @@ swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry; struct swap_slots_cache *cache; + int type; entry.val = 0; + type = memcg_get_swap_type(page); + if (type == SWAP_TYPE_NONE) + goto out; + if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, &entry, HPAGE_PMD_NR); + get_swap_pages(1, &entry, HPAGE_PMD_NR, type); goto out; } @@ -325,7 +435,7 @@ swp_entry_t get_swap_page(struct page *page) * The alloc path here does not touch cache->slots_ret * so cache->free_lock is not taken. 
*/ - cache = raw_cpu_ptr(&swp_slots); + cache = get_slots_cache(type); if (likely(check_cache_active() && cache->slots)) { mutex_lock(&cache->alloc_lock); @@ -335,7 +445,7 @@ swp_entry_t get_swap_page(struct page *page) entry = cache->slots[cache->cur]; cache->slots[cache->cur++].val = 0; cache->nr--; - } else if (refill_swap_slots_cache(cache)) { + } else if (refill_swap_slots_cache(cache, type)) { goto repeat; } } @@ -344,7 +454,7 @@ swp_entry_t get_swap_page(struct page *page) goto out; } - get_swap_pages(1, &entry, 1); + get_swap_pages(1, &entry, 1, type); out: if (mem_cgroup_try_charge_swap(page, entry)) { put_swap_page(page, entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 14e2396fa8a31aa99db03cd7e21386c974e605a6..e103ae583accab5a19197d04f9e3713590343473 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1056,7 +1056,83 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) +#ifdef CONFIG_MEMCG_SWAP_QOS +/* + * Look up the swap type backing @mapping; returns 0 and sets *swap_type + * if @mapping belongs to an active swap file, -EINVAL otherwise. + */ +int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) +{ + struct swap_info_struct *si; + unsigned int type; + int ret = -EINVAL; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + si = swap_info[type]; + if ((si->flags & SWP_WRITEOK) && + (si->swap_file->f_mapping == mapping)) { + *swap_type = type; + ret = 0; + break; + } + } + spin_unlock(&swap_lock); + return ret; +} + +void read_swapfile_for_memcg(struct seq_file *m, int type) +{ + struct swap_info_struct *si; + + spin_lock(&swap_lock); + if (type < nr_swapfiles) { + si = swap_info[type]; + if (si->flags & SWP_WRITEOK) { + seq_file_path(m, si->swap_file, "\t\n\\"); + seq_putc(m, '\n'); + } + } + spin_unlock(&swap_lock); +} + +static long get_avail_pages(unsigned long size, int type) +{ + struct swap_info_struct *si; + long avail_pgs = 0; + + if (type == SWAP_TYPE_ALL) + return atomic_long_read(&nr_swap_pages) / size; + + /* + * Called with swap_avail_lock held; swap_lock nests outside it, + * so drop it across the swap_info[] lookup and re-take it. + */ + spin_unlock(&swap_avail_lock); + spin_lock(&swap_lock); + if (type < nr_swapfiles) { + si = swap_info[type]; + if (si->flags & SWP_WRITEOK) + avail_pgs = si->pages - si->inuse_pages; + } + spin_unlock(&swap_lock); + spin_lock(&swap_avail_lock); + return avail_pgs; +} + +static inline bool should_skip_swap_type(int swap_type, int type) +{ + if (type == SWAP_TYPE_ALL) + return false; + + return (type != swap_type); +} +#else +static inline long get_avail_pages(unsigned long size, int type) +{ + return atomic_long_read(&nr_swap_pages) / size; +} + +static inline bool should_skip_swap_type(int swap_type, int type) +{ + return false; +} +#endif + +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size, + int type) { unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; @@ -1069,7 +1145,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) spin_lock(&swap_avail_lock); - avail_pgs = atomic_long_read(&nr_swap_pages) / size; + avail_pgs = get_avail_pages(size, type); if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; @@ -1086,6 +1162,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); + if (should_skip_swap_type(si->type, type)) { + spin_unlock(&si->lock); + spin_lock(&swap_avail_lock); + goto nextsi; + } if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { @@ -2703,6 
+2784,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) cluster_info = p->cluster_info; p->cluster_info = NULL; frontswap_map = frontswap_map_get(p); + memcg_remove_swapfile(p->type); spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); @@ -3457,7 +3539,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (inode) inode_unlock(inode); if (!error) - enable_swap_slots_cache(); + enable_swap_slots_cache(p->type); return error; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a8412c5d4edace62f24231fcdd56a3f212ea91c1..2fce47af0e83b674e8b89df851c73483db7d94b8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -102,6 +102,8 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Only swap anon pages */ + unsigned int only_swap:1; /* * Cgroup memory below memory.low is protected as long as we @@ -2461,6 +2463,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long ap, fp; enum lru_list lru; + if (sc->only_swap) { + scan_balance = SCAN_ANON; + goto out; + } + /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { scan_balance = SCAN_FILE; @@ -3563,6 +3570,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, bool may_swap) +{ + return __try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, + may_swap, false); +} + +unsigned long __try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + bool may_swap, bool only_swap) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -3576,6 +3592,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = may_swap, + .only_swap = only_swap, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
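
For illustration only (not part of the patch): the new cgroup-v1 knobs can be exercised from userspace roughly as below. The cgroup path and swap file name are assumptions invented for the example; memory.force_swapin triggers on any write, and memory.swap.max accepts memparse-style sizes.

#include <stdio.h>
#include <stdlib.h>

static void write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f || fputs(val, f) == EOF) {
		perror(path);
		exit(1);
	}
	fclose(f);
}

int main(void)
{
	/* Assumed mount point and group name; adjust for the real system. */
	const char *cg = "/sys/fs/cgroup/memory/test";
	char path[256];

	/* Restrict the group to one swap file (name is an assumption). */
	snprintf(path, sizeof(path), "%s/memory.swapfile", cg);
	write_file(path, "/swapfile-a");

	/* Cap the group's swap usage at 1G. */
	snprintf(path, sizeof(path), "%s/memory.swap.max", cg);
	write_file(path, "1G");

	/* Any write triggers swap-in of the group's anon memory. */
	snprintf(path, sizeof(path), "%s/memory.force_swapin", cg);
	write_file(path, "0");

	return 0;
}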
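
To make the grammar that memory.reclaim now accepts concrete, here is a small userspace model of reclaim_param_parse(). The memparse()-style suffix handling is simplified to K/M/G; this is a sketch of the accepted syntax, not the kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Accepts "<size>[ type=anon|type=file]"; returns 0 on success. */
static int parse_reclaim(const char *buf, unsigned long long *bytes,
			 int *anon, int *file)
{
	char *endp;

	*anon = *file = 1;
	*bytes = strtoull(buf, &endp, 0);
	switch (*endp) {
	case 'G': *bytes <<= 10; /* fall through */
	case 'M': *bytes <<= 10; /* fall through */
	case 'K': *bytes <<= 10; endp++; break;
	default: break;
	}
	if (*endp == ' ') {
		endp++;
		if (!strcmp(endp, "type=anon"))
			*file = 0;	/* reclaim anon pages only */
		else if (!strcmp(endp, "type=file"))
			*anon = 0;	/* reclaim page cache only */
		else
			return -1;
	} else if (*endp != '\0') {
		return -1;
	}
	return 0;
}

int main(void)
{
	unsigned long long bytes;
	int anon, file;

	if (!parse_reclaim("1G type=file", &bytes, &anon, &file))
		printf("bytes=%llu anon=%d file=%d\n", bytes, anon, file);
	return 0;
}

In the kernel these flags then map onto struct scan_control: "type=file" clears may_swap, while "type=anon" sets only_swap so get_scan_count() forces SCAN_ANON.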
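
One subtlety in get_slots_cache(): because swp_type_slots is a per-CPU *array*, raw_cpu_ptr(&swp_type_slots) has pointer-to-array type, and indexing it strides by whole arrays rather than single elements; raw_cpu_ptr(&swp_type_slots[swap_type]) is the form that addresses one cache. A plain C demonstration of the pitfall (array shapes are illustrative; no per-cpu machinery involved):

#include <stdio.h>

#define MAX_SWAPFILES	4
#define NR_CPUS		2

struct swap_slots_cache { int cpu, type; };

int main(void)
{
	struct swap_slots_cache slots[NR_CPUS][MAX_SWAPFILES];
	struct swap_slots_cache (*p)[MAX_SWAPFILES] = &slots[0];
	int cpu, type;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (type = 0; type < MAX_SWAPFILES; type++)
			slots[cpu][type] =
				(struct swap_slots_cache){ cpu, type };

	/* Index inside cpu 0's array: reaches element (0, 1). */
	struct swap_slots_cache *good = &(*p)[1];

	/* Index the pointer-to-array itself: lands on (1, 0) instead. */
	struct swap_slots_cache *bad = p[1];

	printf("good: cpu=%d type=%d\n", good->cpu, good->type);
	printf("bad:  cpu=%d type=%d\n", bad->cpu, bad->type);
	return 0;
}

Running it prints "good: cpu=0 type=1" and "bad: cpu=1 type=0": the mis-indexed pointer walks into the next CPU's caches, which for a real per-CPU area means touching another CPU's (or unmapped) memory.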