diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 45ba1f4dc004876f16a57fbcb34debd065769bc2..2025b524028c4359f5d06d01716cf7610d5d8778 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -75,6 +75,10 @@ Currently, these files are in /proc/sys/vm:
 - watermark_boost_factor
 - watermark_scale_factor
 - zone_reclaim_mode
+- cache_reclaim_s
+- cache_reclaim_weight
+- cache_reclaim_enable
+- cache_limit_mbytes
 
 
 admin_reserve_kbytes
@@ -1044,3 +1048,42 @@ of other processes running on other nodes will not be affected.
 Allowing regular swap effectively restricts allocations to the local
 node unless explicitly overridden by memory policies or cpuset
 configurations.
+
+cache_reclaim_s
+===============
+
+cache_reclaim_s is used to set the reclaim interval of periodic memory
+reclaim. When periodic memory reclaim is enabled, memory will be
+reclaimed every cache_reclaim_s seconds.
+
+
+cache_reclaim_weight
+====================
+
+This is the reclaim factor applied in every periodic reclaim pass. When
+periodic memory reclaim is enabled, the amount of memory reclaimed in
+each pass is calculated as:
+  reclaim_amount = cache_reclaim_weight * SWAP_CLUSTER_MAX * nr_cpus_node(nid)
+
+SWAP_CLUSTER_MAX is defined in include/linux/swap.h.
+nr_cpus_node is used to obtain the number of CPUs on node nid.
+
+Memory reclaim uses the workqueue mechanism, so it can block the
+execution of subsequent work items; if memory reclaim takes a lot of
+time, time-sensitive work may be affected.
+
+Note that if the parameters are not configured properly, such as setting
+too large a memory reclaim amount, system performance may become
+unstable.
+
+cache_reclaim_enable
+====================
+
+This is used to switch the periodic memory reclaim feature on/off.
+
+
+cache_limit_mbytes
+==================
+
+This is used to set the upper limit of page cache in megabytes.
+Page cache will be reclaimed periodically if page cache is over the limit.
diff --git a/mm/Kconfig b/mm/Kconfig
index ff0c36f42ca8ec4c0c564057dc3ab54be502261f..df87fab4662150c60b9c2d1a5ef2074d6eb5cb43 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1323,6 +1323,19 @@ config ASCEND_OOM
 	  0: disable oom killer
 	  1: enable oom killer (default,compatible with mainline)
 
+config PAGE_CACHE_LIMIT
+	bool "Support page cache limit"
+	depends on MMU && SYSCTL
+	default n
+	help
+	  Keeping a number of page cache pages can improve the performance of
+	  the system, but if there is a lot of page cache in the system, that
+	  will result in a shortage of memory, and subsequent memory
+	  reclamation operations may lead to performance degradation, so add
+	  periodic memory reclaim to avoid keeping too much page cache.
+
+	  If unsure, say N to disable the PAGE_CACHE_LIMIT.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 6921fedacd07e7b30125d221ea26675d073462dc..07cf74abd241929e7d24d0773dcd05b19f5c67b9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -141,3 +141,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
 obj-$(CONFIG_SHARE_POOL) += share_pool.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
+obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
diff --git a/mm/internal.h b/mm/internal.h
index bcb7f95783bf9045799dc389e0caeea1114b2028..1ebba69437d6f791994c7c447f1e176f0d7441b0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1157,4 +1157,8 @@ struct vma_prepare {
 
 void __meminit __init_single_page(struct page *page, unsigned long pfn,
 				unsigned long zone, int nid);
+
+#ifdef CONFIG_PAGE_CACHE_LIMIT
+unsigned long shrink_memory(unsigned long nr_to_reclaim, bool may_swap);
+#endif /* CONFIG_PAGE_CACHE_LIMIT */
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c
new file mode 100644
index 0000000000000000000000000000000000000000..1ab00225f8ac2469543171c50d6f383dfbf6d1cf
--- /dev/null
+++ b/mm/page_cache_limit.c
@@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for periodic memory reclaim and page cache limit + */ + +#include +#include +#include +#include + +#include "internal.h" + +static int vm_cache_reclaim_s __read_mostly; +static int vm_cache_reclaim_s_max = 43200; +static int vm_cache_reclaim_weight __read_mostly = 1; +static int vm_cache_reclaim_weight_max = 100; +static int vm_cache_reclaim_enable = 1; +static unsigned long vm_cache_limit_mbytes __read_mostly; + +static void shrink_shepherd(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(shepherd, shrink_shepherd); +static struct work_struct vmscan_works[MAX_NUMNODES]; + +static bool should_periodical_reclaim(void) +{ + return vm_cache_reclaim_s && vm_cache_reclaim_enable; +} + +static unsigned long node_reclaim_num(void) +{ + int nid = numa_node_id(); + + return SWAP_CLUSTER_MAX * nr_cpus_node(nid) * vm_cache_reclaim_weight; +} + +static bool page_cache_over_limit(void) +{ + unsigned long lru_file; + unsigned long limit; + + limit = vm_cache_limit_mbytes << (20 - PAGE_SHIFT); + lru_file = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); + if (lru_file > limit) + return true; + + return false; +} + +static bool should_reclaim_page_cache(void) +{ + if (!should_periodical_reclaim()) + return false; + + if (!vm_cache_limit_mbytes) + return false; + + return true; +} + +int cache_reclaim_enable_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + return ret; + + if (should_periodical_reclaim()) + schedule_delayed_work(&shepherd, round_jiffies_relative( + (unsigned long)vm_cache_reclaim_s * HZ)); + + return 0; +} + +int cache_reclaim_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if 
(ret || !write) + return ret; + + if (should_periodical_reclaim()) + mod_delayed_work(system_unbound_wq, &shepherd, + round_jiffies_relative( + (unsigned long)vm_cache_reclaim_s * HZ)); + + return ret; +} + +int cache_limit_mbytes_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + unsigned long vm_cache_limit_mbytes_max; + unsigned long origin_mbytes = vm_cache_limit_mbytes; + int nr_retries = MAX_RECLAIM_RETRIES; + + vm_cache_limit_mbytes_max = totalram_pages() >> (20 - PAGE_SHIFT); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + return ret; + + if (vm_cache_limit_mbytes > vm_cache_limit_mbytes_max) { + vm_cache_limit_mbytes = origin_mbytes; + return -EINVAL; + } + + if (write) { + while (should_reclaim_page_cache() && page_cache_over_limit() && + nr_retries--) { + if (signal_pending(current)) + return -EINTR; + + shrink_memory(node_reclaim_num(), false); + } + } + + return 0; +} + +static void shrink_shepherd(struct work_struct *w) +{ + int node; + + if (!should_periodical_reclaim()) + return; + + for_each_online_node(node) { + if (!work_pending(&vmscan_works[node])) + queue_work_node(node, system_unbound_wq, &vmscan_works[node]); + } + + queue_delayed_work(system_unbound_wq, &shepherd, + round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ)); +} + +static void shrink_page_work(struct work_struct *w) +{ + shrink_memory(node_reclaim_num(), true); +} + +static void shrink_shepherd_timer(void) +{ + int i; + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_WORK(&vmscan_works[i], shrink_page_work); +} + +static struct ctl_table page_cache_limit_table[] = { + { + .procname = "cache_reclaim_s", + .data = &vm_cache_reclaim_s, + .maxlen = sizeof(vm_cache_reclaim_s), + .mode = 0644, + .proc_handler = cache_reclaim_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &vm_cache_reclaim_s_max, + }, + { + .procname = "cache_reclaim_weight", + .data = 
&vm_cache_reclaim_weight, + .maxlen = sizeof(vm_cache_reclaim_weight), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &vm_cache_reclaim_weight_max, + }, + { + .procname = "cache_reclaim_enable", + .data = &vm_cache_reclaim_enable, + .maxlen = sizeof(vm_cache_reclaim_enable), + .mode = 0644, + .proc_handler = cache_reclaim_enable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "cache_limit_mbytes", + .data = &vm_cache_limit_mbytes, + .maxlen = sizeof(vm_cache_limit_mbytes), + .mode = 0644, + .proc_handler = cache_limit_mbytes_sysctl_handler, + }, +}; + +static int __init shrink_page_init(void) +{ + shrink_shepherd_timer(); + + register_sysctl_init("vm", page_cache_limit_table); + + return 0; +} +late_initcall(shrink_page_init) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f13394b112eaea798ca50ff97fe5efa52747a3e..7a676296af30666d31ba5e0efbf069d600e20e3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7880,6 +7880,45 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) } #endif /* CONFIG_HIBERNATION */ +#ifdef CONFIG_PAGE_CACHE_LIMIT +unsigned long shrink_memory(unsigned long nr_to_reclaim, bool may_swap) +{ + unsigned long nr_reclaimed; + unsigned int noreclaim_flag; + int nid = numa_node_id(); + struct scan_control sc = { + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .reclaim_idx = ZONE_MOVABLE, + .may_writepage = !laptop_mode, + .nr_to_reclaim = nr_to_reclaim / 2, + .may_unmap = 1, + .may_swap = may_swap, + .priority = DEF_PRIORITY, + }; + + struct zonelist *zonelist = node_zonelist(nid, sc.gfp_mask); + struct scan_control orig_sc = sc; + + fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); + set_task_reclaim_state(current, &sc.reclaim_state); + + /* Start with ZONE_MOVABLE and try to reclaim half of the target memory */ + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + sc = orig_sc; + sc.reclaim_idx--; + + /* Then try to reclaim remain half memory 
starting from ZONE_NORMAL */ + nr_reclaimed += do_try_to_free_pages(zonelist, &sc); + + set_task_reclaim_state(current, NULL); + memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); + + return nr_reclaimed; +} +#endif /* CONFIG_PAGE_CACHE_LIMIT */ + /* * This kswapd start function will be called by init and node-hot-add. */