From 3286455e54dc6949ac45e6dc4c934158a7f42cbb Mon Sep 17 00:00:00 2001
From: Liu Shixin
Date: Tue, 22 Nov 2022 21:28:03 +0800
Subject: [PATCH 1/4] mm/zswap: replace zswap_init_{started/failed} with
 zswap_init_state

hulk inclusion
category: performance
bugzilla: 187468, https://gitee.com/openeuler/kernel/issues/I61HSG

--------------------------------

zswap_init_started indicates that the initialization has started, and
zswap_init_failed indicates that the initialization has failed. As we
will support initializing zswap after system startup, it's necessary to
add a state to indicate that the initialization is complete and
succeeded, to avoid concurrency issues.

Since we don't care about the difference between init started and init
completion, we only need three states: uninitialized, init failed,
init succeeded.

Signed-off-by: Liu Shixin
---
 mm/zswap.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 358f48b173dc..f8a763c3bc5d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -203,11 +203,12 @@ static DEFINE_SPINLOCK(zswap_pools_lock);
 /* pool counter to provide unique names to zpool */
 static atomic_t zswap_pools_count = ATOMIC_INIT(0);
 
-/* used by param callback function */
-static bool zswap_init_started;
+#define ZSWAP_UNINIT		0
+#define ZSWAP_INIT_SUCCEED	1
+#define ZSWAP_INIT_FAILED	2
 
-/* fatal error during init */
-static bool zswap_init_failed;
+/* init state */
+static int zswap_init_state;
 
 /* init completed, but couldn't create the initial pool */
 static bool zswap_has_pool;
@@ -757,7 +758,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 	char *s = strstrip((char *)val);
 	int ret;
 
-	if (zswap_init_failed) {
+	if (zswap_init_state == ZSWAP_INIT_FAILED) {
 		pr_err("can't set param, initialization failed\n");
 		return -ENODEV;
 	}
@@ -769,7 +770,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 	/* if this is load-time (pre-init) param setting,
 	 * don't create a pool; that's done during init.
 	 */
-	if (!zswap_init_started)
+	if (zswap_init_state == ZSWAP_UNINIT)
 		return param_set_charp(s, kp);
 
 	if (!type) {
@@ -860,11 +861,11 @@ static int zswap_zpool_param_set(const char *val,
 static int zswap_enabled_param_set(const char *val,
 				   const struct kernel_param *kp)
 {
-	if (zswap_init_failed) {
+	if (zswap_init_state == ZSWAP_INIT_FAILED) {
 		pr_err("can't enable, initialization failed\n");
 		return -ENODEV;
 	}
-	if (!zswap_has_pool && zswap_init_started) {
+	if (!zswap_has_pool && zswap_init_state == ZSWAP_INIT_SUCCEED) {
 		pr_err("can't enable, no pool configured\n");
 		return -ENODEV;
 	}
@@ -1442,8 +1443,6 @@ static int __init init_zswap(void)
 	struct zswap_pool *pool;
 	int ret;
 
-	zswap_init_started = true;
-
 	if (zswap_entry_cache_create()) {
 		pr_err("entry cache creation failed\n");
 		goto cache_fail;
@@ -1481,6 +1480,7 @@ static int __init init_zswap(void)
 	frontswap_register_ops(&zswap_frontswap_ops);
 	if (zswap_debugfs_init())
 		pr_warn("debugfs initialization failed\n");
+	zswap_init_state = ZSWAP_INIT_SUCCEED;
 	return 0;
 
 fallback_fail:
@@ -1492,7 +1492,7 @@ static int __init init_zswap(void)
 	zswap_entry_cache_destroy();
 cache_fail:
 	/* if built-in, we aren't unloaded on failure; don't allow use */
-	zswap_init_failed = true;
+	zswap_init_state = ZSWAP_INIT_FAILED;
 	zswap_enabled = false;
 	return -ENOMEM;
 }
-- 
Gitee

From 699bb9e8302081ce164c124fdfdd32b131b4f468 Mon Sep 17 00:00:00 2001
From: Liu Shixin
Date: Tue, 22 Nov 2022 21:28:04 +0800
Subject: [PATCH 2/4] mm/zswap: delay the initialization of zswap until the
 first enablement

hulk inclusion
category: performance
bugzilla: 187468, https://gitee.com/openeuler/kernel/issues/I61HSG

--------------------------------

The zswap initialization consumes some memory. If we don't use zswap,
there's no need to initialize it. So delay the initialization of zswap
until the first enablement.

Signed-off-by: Liu Shixin
---
 mm/zswap.c | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index f8a763c3bc5d..a2d108b0e0a8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -79,6 +79,8 @@ static bool zswap_pool_reached_full;
 
 #define ZSWAP_PARAM_UNSET ""
 
+static int zswap_setup(void);
+
 /* Enable/disable zswap */
 static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
 static int zswap_enabled_param_set(const char *,
@@ -209,6 +211,8 @@ static atomic_t zswap_pools_count = ATOMIC_INIT(0);
 
 /* init state */
 static int zswap_init_state;
+/* used to ensure the integrity of initialization */
+static DEFINE_MUTEX(zswap_init_lock);
 
 /* init completed, but couldn't create the initial pool */
 static bool zswap_has_pool;
@@ -767,11 +771,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
 		return 0;
 
-	/* if this is load-time (pre-init) param setting,
+	/*
+	 * if zswap has not been initialized,
 	 * don't create a pool; that's done during init.
 	 */
-	if (zswap_init_state == ZSWAP_UNINIT)
-		return param_set_charp(s, kp);
+	mutex_lock(&zswap_init_lock);
+	if (zswap_init_state == ZSWAP_UNINIT) {
+		ret = param_set_charp(s, kp);
+		mutex_unlock(&zswap_init_lock);
+		return ret;
+	}
+	mutex_unlock(&zswap_init_lock);
 
 	if (!type) {
 		if (!zpool_has_pool(s)) {
@@ -861,6 +871,14 @@ static int zswap_zpool_param_set(const char *val,
 static int zswap_enabled_param_set(const char *val,
 				   const struct kernel_param *kp)
 {
+	if (system_state == SYSTEM_RUNNING) {
+		mutex_lock(&zswap_init_lock);
+		if (zswap_setup()) {
+			mutex_unlock(&zswap_init_lock);
+			return -ENODEV;
+		}
+		mutex_unlock(&zswap_init_lock);
+	}
 	if (zswap_init_state == ZSWAP_INIT_FAILED) {
 		pr_err("can't enable, initialization failed\n");
 		return -ENODEV;
@@ -1435,14 +1453,14 @@ static int __init zswap_debugfs_init(void)
 static void __exit zswap_debugfs_exit(void) { }
 #endif
 
-/*********************************
-* module init and exit
-**********************************/
-static int __init init_zswap(void)
+static int zswap_setup(void)
 {
 	struct zswap_pool *pool;
 	int ret;
 
+	if (zswap_init_state != ZSWAP_UNINIT)
+		return 0;
+
 	if (zswap_entry_cache_create()) {
 		pr_err("entry cache creation failed\n");
 		goto cache_fail;
@@ -1496,6 +1514,18 @@ static int zswap_setup(void)
 	zswap_enabled = false;
 	return -ENOMEM;
 }
+
+/*********************************
+* module init and exit
+**********************************/
+static int __init init_zswap(void)
+{
+	/* skip init if zswap is disabled when system startup */
+	if (!zswap_enabled)
+		return 0;
+	return zswap_setup();
+}
+
 /* must be late so crypto has time to come up */
 late_initcall(init_zswap);
-- 
Gitee

From 1e44290e13f3fcd1f89355cc6560d29f73149518 Mon Sep 17 00:00:00 2001
From: Liu Shixin
Date: Tue, 22 Nov 2022 21:28:05 +0800
Subject: [PATCH 3/4] mm/zswap: remove __init in the initialization

hulk inclusion
category: bugfix
bugzilla: 187468, https://gitee.com/openeuler/kernel/issues/I61HSG

--------------------------------

As zswap can be initialized after system startup, the __init label
should be removed.

Signed-off-by: Liu Shixin
---
 mm/zswap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index a2d108b0e0a8..d8e8d0084a22 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -266,13 +266,13 @@ static void zswap_update_total_size(void)
 **********************************/
 static struct kmem_cache *zswap_entry_cache;
 
-static int __init zswap_entry_cache_create(void)
+static int zswap_entry_cache_create(void)
 {
 	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
 	return zswap_entry_cache == NULL;
 }
 
-static void __init zswap_entry_cache_destroy(void)
+static void zswap_entry_cache_destroy(void)
 {
 	kmem_cache_destroy(zswap_entry_cache);
 }
@@ -653,7 +653,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	return NULL;
 }
 
-static __init struct zswap_pool *__zswap_pool_create_fallback(void)
+static struct zswap_pool *__zswap_pool_create_fallback(void)
 {
 	bool has_comp, has_zpool;
 
@@ -1409,7 +1409,7 @@ static struct frontswap_ops zswap_frontswap_ops = {
 
 static struct dentry *zswap_debugfs_root;
 
-static int __init zswap_debugfs_init(void)
+static int zswap_debugfs_init(void)
 {
 	if (!debugfs_initialized())
 		return -ENODEV;
@@ -1445,7 +1445,7 @@ static void __exit zswap_debugfs_exit(void)
 	debugfs_remove_recursive(zswap_debugfs_root);
 }
 #else
-static int __init zswap_debugfs_init(void)
+static int zswap_debugfs_init(void)
 {
 	return 0;
 }
-- 
Gitee

From 49854662c5486624f6b4f61f043a9f684d4dc5d3 Mon Sep 17 00:00:00 2001
From: Liu Shixin
Date: Tue, 22 Nov 2022 21:28:06 +0800
Subject: [PATCH 4/4] mm/page_alloc.c: add sysctl to revise the batch and high
 of percpu pageset

hulk inclusion
category: performance
bugzilla: 187468, https://gitee.com/openeuler/kernel/issues/I61HVC

--------------------------------

Patch d8a759b57035 ("mm, page_alloc: double zone's batchsize") changed
the default batchsize. For some machines with large memory, the value
seems to be too large. Although percpu_pagelist_fraction can be used to
revise the batchsize, it needs to be adjusted based on managed memory.
So add a new sysctl percpu_max_batchsize to revise the batchsize to
adapt to different scenarios.

Signed-off-by: Liu Shixin
---
 Documentation/admin-guide/sysctl/vm.rst | 10 ++++++
 include/linux/mmzone.h                  |  3 ++
 kernel/sysctl.c                         |  8 +++++
 mm/page_alloc.c                         | 42 +++++++++++++++++++++++--
 4 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index a84bef7aa864..b508acfdde2e 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -65,6 +65,7 @@ Currently, these files are in /proc/sys/vm:
 - page-cluster
 - panic_on_oom
 - percpu_pagelist_fraction
+- percpu_max_batchsize
 - stat_interval
 - stat_refresh
 - numa_stat
@@ -856,6 +857,15 @@ the high water marks for each per cpu page list.
 If the user writes '0' to this sysctl, it will revert to this default
 behavior.
 
+percpu_max_batchsize
+========================
+
+This is used to set up the max batch and high size of percpu in each zone.
+The default value is set to (256 * 1024) / PAGE_SIZE.
+The max value is limited to (512 * 1024) / PAGE_SIZE.
+The min value is limited to (64 * 1024) / PAGE_SIZE.
+
+
 stat_interval
 =============
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7f25539d2fe4..0a70b4bdd236 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1009,6 +1009,8 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
 		size_t *, loff_t *);
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
+int percpu_max_batchsize_sysctl_handler(struct ctl_table *, int,
+		void *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
@@ -1016,6 +1018,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 int numa_zonelist_order_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 extern int percpu_pagelist_fraction;
+extern int percpu_max_batchsize;
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN	16
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index eb8db15b5902..372121d46029 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2993,6 +2993,14 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= percpu_pagelist_fraction_sysctl_handler,
 		.extra1		= SYSCTL_ZERO,
 	},
+	{
+		.procname	= "percpu_max_batchsize",
+		.data		= &percpu_max_batchsize,
+		.maxlen		= sizeof(percpu_max_batchsize),
+		.mode		= 0644,
+		.proc_handler	= percpu_max_batchsize_sysctl_handler,
+		.extra1		= SYSCTL_ZERO,
+	},
 	{
 		.procname	= "page_lock_unfairness",
 		.data		= &sysctl_page_lock_unfairness,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 274b68a147ea..d58ddd6e7f73 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,8 @@ typedef int __bitwise fpi_t;
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
+#define MAX_PERCPU_MAX_BATCHSIZE	((512 * 1024) / PAGE_SIZE)
+#define MIN_PERCPU_MAX_BATCHSIZE	(MAX_PERCPU_MAX_BATCHSIZE / 8)
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
@@ -167,6 +169,8 @@ unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
 
 int percpu_pagelist_fraction;
+int percpu_max_batchsize = MAX_PERCPU_MAX_BATCHSIZE / 2;
+
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 #ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
 DEFINE_STATIC_KEY_TRUE(init_on_alloc);
@@ -6757,10 +6761,9 @@ static int zone_batchsize(struct zone *zone)
 	 * size of the zone.
 	 */
 	batch = zone_managed_pages(zone) / 1024;
-	/* But no more than a meg. */
-	if (batch * PAGE_SIZE > 1024 * 1024)
-		batch = (1024 * 1024) / PAGE_SIZE;
 	batch /= 4;		/* We effectively *= 4 below */
+	if (batch > percpu_max_batchsize)
+		batch = percpu_max_batchsize;
 	if (batch < 1)
 		batch = 1;
 
@@ -8615,6 +8618,39 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
+int percpu_max_batchsize_sysctl_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int old_percpu_max_batchsize;
+	int ret;
+
+	mutex_lock(&pcp_batch_high_lock);
+	old_percpu_max_batchsize = percpu_max_batchsize;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (!write || ret < 0)
+		goto out;
+
+	/* Sanity checking to avoid pcp imbalance */
+	if (percpu_max_batchsize > MAX_PERCPU_MAX_BATCHSIZE ||
+	    percpu_max_batchsize < MIN_PERCPU_MAX_BATCHSIZE) {
+		percpu_max_batchsize = old_percpu_max_batchsize;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* No change? */
+	if (percpu_max_batchsize == old_percpu_max_batchsize)
+		goto out;
+
+	for_each_populated_zone(zone)
+		zone_set_pageset_high_and_batch(zone);
+out:
+	mutex_unlock(&pcp_batch_high_lock);
+	return ret;
+}
+
 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
 /*
  * Returns the number of pages that arch has reserved but
-- 
Gitee