From ca1116a04b134a07921d5f2a4b3bef72901de13d Mon Sep 17 00:00:00 2001 From: Kaihao Bai Date: Sat, 7 Oct 2023 13:54:27 +0800 Subject: [PATCH 1/2] anolis: mm: support allocating page table pages bound to local numa node ANBZ: #6618 Currently page table can not be migrated through numa balancing. If most of page table pages are located on remote numa node, the performance can be degraded in some scenarios. Thus this patch provides a way to allocate page table pages in local numa node with reserved memory range. To switch on the pgtable bind feature globally: echo 2 > /sys/kernel/mm/pgtable_bind/enabled Only for misplaced page table pages statistics echo 1 > /sys/kernel/mm/pgtable_bind/enabled To enable the pgtable bind feature of cgroup level: echo 1 > /sys/fs/cgroup/memory//memory.pgtable_bind To get the amount of misplaced page table pages cat /sys/fs/cgroup/memory//memory.pgtable_misplaced Besides, if one needs to reset the value of pgtable_misplaced echo 0 > /sys/fs/cgroup/memory//memory.pgtable_misplaced Signed-off-by: Kaihao Bai Reviewed-by: Xu Yu --- arch/arm64/mm/pgd.c | 23 +++++++++++ include/asm-generic/pgalloc.h | 66 ++++++++++++++++++++++++++++++++ include/linux/gfp.h | 4 ++ include/linux/memcontrol.h | 5 +++ include/linux/pgtable_bind.h | 32 ++++++++++++++++ include/trace/events/mmflags.h | 3 +- mm/Kconfig | 12 ++++++ mm/Makefile | 1 + mm/memcontrol.c | 58 +++++++++++++++++++++++++++- mm/page_alloc.c | 28 ++++++++++++++ mm/pgtable_bind.c | 70 ++++++++++++++++++++++++++++++++++ tools/perf/builtin-kmem.c | 1 + 12 files changed, 301 insertions(+), 2 deletions(-) create mode 100644 include/linux/pgtable_bind.h create mode 100644 mm/pgtable_bind.c diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 4a64089e5771..1150d4f38581 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include @@ -21,6 +23,27 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { gfp_t gfp =
GFP_PGTABLE_USER; +#ifdef CONFIG_PGTABLE_BIND + if (pgtable_stat_enabled()) { + struct mem_cgroup *memcg; + bool pgtable_alloc = false; + + memcg = get_mem_cgroup_from_mm(mm); + if (memcg) { + pgtable_alloc = memcg->allow_pgtable_bind; + css_put(&memcg->css); + } + + /* Only target on user processes */ + if (pgtable_alloc) { + gfp |= __GFP_PGTABLE; + + if (pgtable_bind_enabled()) + gfp |= __GFP_HIGH | __GFP_THISNODE; + } + } +#endif + if (PGD_SIZE == PAGE_SIZE) return (pgd_t *)__get_free_page(gfp); else diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 034c87bc363c..b83e12ef3100 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -2,6 +2,9 @@ #ifndef __ASM_GENERIC_PGALLOC_H #define __ASM_GENERIC_PGALLOC_H +#include +#include + #ifdef CONFIG_MMU #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO | __GFP_NOKFENCE) @@ -60,6 +63,27 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) { struct page *pte; +#ifdef CONFIG_PGTABLE_BIND + if (pgtable_stat_enabled()) { + struct mem_cgroup *memcg; + bool pgtable_alloc = false; + + memcg = get_mem_cgroup_from_mm(mm); + if (memcg) { + pgtable_alloc = memcg->allow_pgtable_bind; + css_put(&memcg->css); + } + + /* Only target on user processes */ + if (pgtable_alloc) { + gfp |= __GFP_PGTABLE; + + if (pgtable_bind_enabled()) + gfp |= __GFP_HIGH | __GFP_THISNODE; + } + } +#endif + pte = alloc_page(gfp); if (!pte) return NULL; @@ -121,6 +145,27 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) struct page *page; gfp_t gfp = GFP_PGTABLE_USER; +#ifdef CONFIG_PGTABLE_BIND + if (pgtable_stat_enabled()) { + struct mem_cgroup *memcg; + bool pgtable_alloc = false; + + memcg = get_mem_cgroup_from_mm(mm); + if (memcg) { + pgtable_alloc = memcg->allow_pgtable_bind; + css_put(&memcg->css); + } + + /* Only target on user processes */ + if (pgtable_alloc) { + gfp |= __GFP_PGTABLE; + + if (pgtable_bind_enabled()) + gfp |= __GFP_HIGH | 
__GFP_THISNODE; + } + } +#endif + if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; page = alloc_pages(gfp, 0); @@ -161,6 +206,27 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; +#ifdef CONFIG_PGTABLE_BIND + if (pgtable_stat_enabled()) { + struct mem_cgroup *memcg; + bool pgtable_alloc = false; + + memcg = get_mem_cgroup_from_mm(mm); + if (memcg) { + pgtable_alloc = memcg->allow_pgtable_bind; + css_put(&memcg->css); + } + + /* Only target on user processes */ + if (pgtable_alloc) { + gfp |= __GFP_PGTABLE; + + if (pgtable_bind_enabled()) + gfp |= __GFP_HIGH | __GFP_THISNODE; + } + } +#endif + if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; return (pud_t *)get_zeroed_page(gfp); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 404c50d81192..d0e96ff35a67 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -44,6 +44,7 @@ struct vm_area_struct; #else #define ___GFP_NOLOCKDEP 0 #endif +#define ___GFP_PGTABLE 0x4000000u #define ___GFP_NOKFENCE 0x8000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ @@ -87,6 +88,8 @@ struct vm_area_struct; * * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. * + * %__GFP_PGTABLE indicates the allocation of page table pages. + * * %__GFP_NOKFENCE informs DO NOT try to alloc page from kfence pool. 
*/ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) @@ -94,6 +97,7 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) #define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) +#define __GFP_PGTABLE ((__force gfp_t)___GFP_PGTABLE) #define __GFP_NOKFENCE ((__force gfp_t)___GFP_NOKFENCE) /** diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 524255a9501f..4baa7dc04d3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -510,6 +510,11 @@ struct mem_cgroup { struct lru_gen_mm_list mm_list; #endif +#ifdef CONFIG_PGTABLE_BIND + unsigned long pgtable_misplaced; + bool allow_pgtable_bind; +#endif + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) diff --git a/include/linux/pgtable_bind.h b/include/linux/pgtable_bind.h new file mode 100644 index 000000000000..919dfa188d39 --- /dev/null +++ b/include/linux/pgtable_bind.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PGTABLE_BIND_H_ +#define _LINUX_PGTABLE_BIND_H_ + +#include +#include + +#ifdef CONFIG_PGTABLE_BIND +DECLARE_STATIC_KEY_FALSE(pgtable_bind_enabled_key); +DECLARE_STATIC_KEY_FALSE(pgtable_stat_enabled_key); +static inline bool pgtable_bind_enabled(void) +{ + return static_key_enabled(&pgtable_bind_enabled_key); +} + +static inline bool pgtable_stat_enabled(void) +{ + return static_key_enabled(&pgtable_stat_enabled_key); +} +#else +static inline bool pgtable_bind_enabled(void) +{ + return false; +} + +static inline bool pgtable_stat_enabled(void) +{ + return false; +} +#endif + +#endif /* _LINUX_PGTABLE_BIND_H_ */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index c7dff5682053..70b50472199c 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -48,7 +48,8 @@ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned 
long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ - {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"}\ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ + {(unsigned long)__GFP_PGTABLE, "__GFP_PGTABLE"} \ #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/mm/Kconfig b/mm/Kconfig index 7e52e0b34470..574c68d3cf53 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -982,6 +982,18 @@ config PAGECACHE_LIMIT If unsure, say N. +config PGTABLE_BIND + bool "Enable page table allocation bound to CPUs and misplaced statistics" + depends on MEMCG + default n + help + This feature is used to solve the problem that the page table pages can not be + migrated through numa balancing. If pages are located on remote numa node, the + performance can be degraded in some scenarios. Thus this configuration provides a + way to allocate page table pages in local numa node with reserved memory range. + + If unsure, say N. + # multi-gen LRU { config LRU_GEN bool "Multi-Gen LRU" diff --git a/mm/Makefile b/mm/Makefile index 68fc4505476b..812c82b26d2c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -136,3 +136,4 @@ obj-$(CONFIG_ASYNC_FORK) += async_fork.o obj-$(CONFIG_PAGE_PREZERO) += prezero.o obj-$(CONFIG_PAGECACHE_LIMIT) += pagecache_limit.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o +obj-$(CONFIG_PGTABLE_BIND) += pgtable_bind.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd3122bb3c35..eb610f727047 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6607,6 +6607,51 @@ static int mem_cgroup_allow_pgcache_sync_write(struct cgroup_subsys_state *css, } #endif /* CONFIG_PAGECACHE_LIMIT */ +#ifdef CONFIG_PGTABLE_BIND +static u64 memcg_pgtable_bind_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return READ_ONCE(memcg->allow_pgtable_bind); +} + +static int memcg_pgtable_bind_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val) + memcg->allow_pgtable_bind = true; + else + memcg->allow_pgtable_bind = false; + + return 0; +} + +static u64 memcg_pgtable_misplaced_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return READ_ONCE(memcg->pgtable_misplaced); +} + +static int memcg_pgtable_misplaced_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val) + return -EINVAL; + + /* reset the stat of current memcg */ + memcg->pgtable_misplaced = 0; + + return 0; +} +#endif /* CONFIG_PGTABLE_BIND */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE static int memcg_thp_reclaim_show(struct seq_file *m, void *v) { @@ -7200,7 +7245,18 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memcg_thp_control_write, }, #endif - +#ifdef CONFIG_PGTABLE_BIND + { + .name = "pgtable_bind", + .write_u64 = memcg_pgtable_bind_write, + .read_u64 = memcg_pgtable_bind_read, + }, + { + .name = "pgtable_misplaced", + .write_u64 = memcg_pgtable_misplaced_write, + .read_u64 = memcg_pgtable_misplaced_read, + }, +#endif { }, /* terminate */ }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d64097ed3208..2d992892538d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4196,6 +4196,24 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); +#ifdef CONFIG_PGTABLE_BIND + /* + * If allocated page belongs to remote numa node, + * accumulate memcg->pgtable_misplaced to show how many pages
+ */ + if ((gfp_mask & __GFP_PGTABLE) && + (zone_to_nid(ac->preferred_zoneref->zone) != zone_to_nid(zone))) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_mm(current->mm); + if (memcg) { + memcg->pgtable_misplaced++; + css_put(&memcg->css); + } + } +#endif + /* * If this is a high-order atomic allocation then check * if the pageblock should be reserved for the future @@ -5280,6 +5298,16 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, */ ac.nodemask = nodemask; + /* + * Drop the __GFP_THISNODE restriction in the slowpath if the current + * allocation is a page table. + */ + if (gfp_mask & __GFP_PGTABLE) { + gfp_mask &= ~__GFP_THISNODE; + alloc_mask &= ~__GFP_THISNODE; + ac.zonelist = node_zonelist(preferred_nid, gfp_mask); + } + page = __alloc_pages_slowpath(alloc_mask, order, &ac); out: diff --git a/mm/pgtable_bind.c b/mm/pgtable_bind.c new file mode 100644 index 000000000000..1f6caa54f1da --- /dev/null +++ b/mm/pgtable_bind.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PGTABLE_BIND +#ifdef CONFIG_SYSFS +DEFINE_STATIC_KEY_FALSE(pgtable_bind_enabled_key); +DEFINE_STATIC_KEY_FALSE(pgtable_stat_enabled_key); + +static ssize_t pgtable_bind_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", !!static_key_enabled(&pgtable_bind_enabled_key) + + !!static_key_enabled(&pgtable_stat_enabled_key)); +} + +static ssize_t pgtable_bind_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + static DEFINE_MUTEX(mutex); + ssize_t ret = count; + + mutex_lock(&mutex); + + if (!strncmp(buf, "2", 1)) { + static_branch_enable(&pgtable_bind_enabled_key); + static_branch_enable(&pgtable_stat_enabled_key); + } else if (!strncmp(buf, "1", 1)) { + static_branch_disable(&pgtable_bind_enabled_key); + static_branch_enable(&pgtable_stat_enabled_key); + } else if
(!strncmp(buf, "0", 1)) { + static_branch_disable(&pgtable_bind_enabled_key); + static_branch_disable(&pgtable_stat_enabled_key); + } + + mutex_unlock(&mutex); + return ret; +} + +static struct kobj_attribute pgtable_bind_enabled_attr = + __ATTR(enabled, 0644, pgtable_bind_enabled_show, + pgtable_bind_enabled_store); +static struct attribute *pgtable_bind_attrs[] = { + &pgtable_bind_enabled_attr.attr, + NULL, +}; +static const struct attribute_group pgtable_bind_attr_group = { + .attrs = pgtable_bind_attrs, + .name = "pgtable_bind", +}; + +static int __init pgtable_bind_init(void) +{ + int ret; + + ret = sysfs_create_group(mm_kobj, &pgtable_bind_attr_group); + if (ret) + pr_err("pgtable_bind: register sysfs failed\n"); + + return ret; +} +subsys_initcall(pgtable_bind_init); +#endif /* CONFIG_SYSFS */ +#endif /* CONFIG_PGTABLE_BIND */ diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 0062445e8ead..d614956cb826 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -660,6 +660,7 @@ static const struct { { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, { "__GFP_KSWAPD_RECLAIM", "KR" }, + { "__GFP_PGTABLE", "PT" }, }; static size_t max_gfp_len; -- Gitee From 60bbe4548bb320eb2044b0c6b8ac61376b33ff16 Mon Sep 17 00:00:00 2001 From: Kaihao Bai Date: Sat, 7 Oct 2023 17:41:37 +0800 Subject: [PATCH 2/2] anolis: configs: enable page table bind related configuration ANBZ: #6618 Enable page table bind related configuration of arm64/x86 architecture Signed-off-by: Kaihao Bai --- arch/arm64/configs/anolis-debug_defconfig | 1 + arch/arm64/configs/anolis_defconfig | 1 + arch/x86/configs/anolis-debug_defconfig | 1 + arch/x86/configs/anolis_defconfig | 1 + 4 files changed, 4 insertions(+) diff --git a/arch/arm64/configs/anolis-debug_defconfig b/arch/arm64/configs/anolis-debug_defconfig index e045a6594989..6f3ac186e92c 100644 --- a/arch/arm64/configs/anolis-debug_defconfig +++ b/arch/arm64/configs/anolis-debug_defconfig @@ -1072,6
+1072,7 @@ CONFIG_ASYNC_FORK=y # CONFIG_PAGE_PREZERO is not set CONFIG_TEXT_UNEVICTABLE=y CONFIG_PAGECACHE_LIMIT=y +CONFIG_PGTABLE_BIND=y CONFIG_LRU_GEN=y # CONFIG_LRU_GEN_ENABLED is not set # CONFIG_LRU_GEN_STATS is not set diff --git a/arch/arm64/configs/anolis_defconfig b/arch/arm64/configs/anolis_defconfig index 88329cd14774..6b01a7065c9f 100644 --- a/arch/arm64/configs/anolis_defconfig +++ b/arch/arm64/configs/anolis_defconfig @@ -1092,6 +1092,7 @@ CONFIG_ASYNC_FORK=y # CONFIG_PAGE_PREZERO is not set CONFIG_TEXT_UNEVICTABLE=y CONFIG_PAGECACHE_LIMIT=y +CONFIG_PGTABLE_BIND=y CONFIG_LRU_GEN=y # CONFIG_LRU_GEN_ENABLED is not set # CONFIG_LRU_GEN_STATS is not set diff --git a/arch/x86/configs/anolis-debug_defconfig b/arch/x86/configs/anolis-debug_defconfig index 7f6dfb578a79..5f5cb69f3ee6 100644 --- a/arch/x86/configs/anolis-debug_defconfig +++ b/arch/x86/configs/anolis-debug_defconfig @@ -1097,6 +1097,7 @@ CONFIG_ASYNC_FORK=y CONFIG_PAGE_PREZERO=y CONFIG_TEXT_UNEVICTABLE=y CONFIG_PAGECACHE_LIMIT=y +CONFIG_PGTABLE_BIND=y CONFIG_LRU_GEN=y # CONFIG_LRU_GEN_ENABLED is not set # CONFIG_LRU_GEN_STATS is not set diff --git a/arch/x86/configs/anolis_defconfig b/arch/x86/configs/anolis_defconfig index 49399fabc0ec..a98da61a8fe3 100644 --- a/arch/x86/configs/anolis_defconfig +++ b/arch/x86/configs/anolis_defconfig @@ -1095,6 +1095,7 @@ CONFIG_ASYNC_FORK=y CONFIG_PAGE_PREZERO=y CONFIG_TEXT_UNEVICTABLE=y CONFIG_PAGECACHE_LIMIT=y +CONFIG_PGTABLE_BIND=y CONFIG_LRU_GEN=y # CONFIG_LRU_GEN_ENABLED is not set # CONFIG_LRU_GEN_STATS is not set -- Gitee