diff --git a/Documentation/mm/dynamic_hugetlb.rst b/Documentation/mm/dynamic_hugetlb.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0136b775fdb7882c8c7f0d49b44f6e5f3eaad6b4
--- /dev/null
+++ b/Documentation/mm/dynamic_hugetlb.rst
@@ -0,0 +1,100 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Dynamic Hugetlb
+===============
+
+Overview
+========
+
+Dynamic hugetlb is a feature built on top of hugetlb and memcontrol. It allows
+huge pages to be split dynamically within a memory cgroup. A new structure,
+dhugetlb_pool, is added to every mem_cgroup to manage the pages configured for
+that mem_cgroup. Processes in a mem_cgroup that has a dhugetlb_pool configured
+will preferentially use pages from that pool.
+
+Dynamic hugetlb supports three page sizes: 1G and 2M huge pages and 4K pages.
+In a mem_cgroup configured with a dhugetlb_pool, processes may allocate 1G/2M
+huge pages only from the dhugetlb_pool; there is no such constraint for 4K
+pages. If there are not enough 4K pages in the dhugetlb_pool, pages can also
+be allocated from the buddy system. Therefore, before using dynamic hugetlb,
+users must know how many huge pages they need.
+
+Conflict
+========
+
+1. Conflict with THP
+--------------------
+
+When THP is enabled, an order-0 page allocation may be converted into a
+higher-order allocation, and such an allocation skips the dhugetlb_pool.
+To use the dynamic hugetlb feature, THP has to be disabled for now.
+
+2. Conflict with hugetlb_vmemmap
+--------------------------------
+
+The dynamic hugetlb feature needs to split and merge pages frequently, and
+hugetlb_vmemmap degrades the performance of page split and merge. To use
+dynamic hugetlb, please disable hugetlb_vmemmap.
+
+Usage
+=====
+
+1) Add ``dynamic_hugetlb=on`` to the kernel command line to enable the dynamic
+   hugetlb feature.
+
+2) Preallocate some 1G huge pages through hugetlb.
+
+3) Create a mem_cgroup and configure a dhugetlb_pool for it.
+
+4) Configure the number of 1G/2M huge pages; the remaining pages in the
+   dhugetlb_pool will be used as basic (4K) pages.
+
+5) Bind the process to the mem_cgroup; its memory will then be allocated from
+   the dhugetlb_pool.
+
+User control
+============
+
+1. dynamic_hugetlb=
+-------------------
+
+Add ``dynamic_hugetlb=on`` to the kernel command line to enable the dynamic
+hugetlb feature. By default, the feature is disabled.
+
+2. dhugetlb.nr_pages
+--------------------
+
+Each memory cgroup has a ``dhugetlb.nr_pages`` interface used to create and
+configure a dynamic hugetlb pool. If this interface is not configured, the
+original functions are not affected. If it is configured, the memory used by
+processes in this memory cgroup will be allocated from the corresponding hpool.
+
+Usage:
+   ``echo <nid> <count> > /sys/fs/cgroup/memory/<memcg>/dhugetlb.nr_pages``:
+
+   Create a dynamic hugetlb pool and add <count> 1G huge pages from NUMA node
+   <nid> to the pool.
+
+   ``cat /sys/fs/cgroup/memory/<memcg>/dhugetlb.nr_pages``:
+
+   Read the memory information of the hpool, including the free and used
+   amounts of huge pages and normal pages.
+
+3. dhugetlb.1G.reserved_pages
+-----------------------------
+
+Each memory cgroup has a ``dhugetlb.1G.reserved_pages`` interface used to
+reserve 1G huge pages. By default, all memory configured into a dynamic
+hugetlb pool can be used only as normal pages; to use it as 1G huge pages,
+first configure the number of 1G huge pages through this interface.
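+
+For example, assuming a pool has already been created for a memory cgroup
+named ``dpool_demo`` and two 1G huge pages are wanted (the cgroup name and the
+count here are purely illustrative), reserving the pages and then reading back
+the pool state might look like::
+
+    echo 2 > /sys/fs/cgroup/memory/dpool_demo/dhugetlb.1G.reserved_pages
+    cat /sys/fs/cgroup/memory/dpool_demo/dhugetlb.nr_pages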
+ +Usage: + ``echo > /sys/fs/cgroup/memory//dhugetlb.1G.reserved_pages`` + +4. dhugetlb.2M.reserved_pages +----------------------------- + +Similar to the previous interface, this is used to configure the number of 2M huge pages. + +Usage: + ``echo > /sys/fs/cgroup/memory//dhugetlb.2M.reserved_pages`` + +--- +Liu Shixin, Jan 2022 diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 703fe5bc2535dca3d1b4bbb75a4d1e368ff2f34b..22853245155ae1791c5685b4ee799e43208262b5 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1148,6 +1148,7 @@ CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_MEMORY_RELIABLE=y +CONFIG_DYNAMIC_POOL=y # # Data Access Monitoring diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index c9e816e54003230a5504f29d5427055a9a9fee06..276e8d4b7e10466ff7a21c7af10fbd138e3a2499 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1163,6 +1163,7 @@ CONFIG_LRU_GEN=y CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y CONFIG_LOCK_MM_AND_FIND_VMA=y +CONFIG_DYNAMIC_POOL=y # # Data Access Monitoring diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b0edf5fe8132b16197650eb7d94a3561b0331dec..86a56890ce0136b14314b634315529578606be67 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1294,6 +1295,9 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) */ mpol_shared_policy_init(&p->policy, NULL); + /* Initialize hpool here in case of a quick call to destroy */ + dynamic_pool_bind_file(p, sbinfo->hstate); + return &p->vfs_inode; } @@ -1306,6 +1310,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + dynamic_pool_unbind_file(HUGETLBFS_I(inode)); } static const struct address_space_operations hugetlbfs_aops = { diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8f8cd1c60e0422e3d83a46d799860a0f2045e319..57a431c1130baa357398a5cabb7a8f73ef3c6962 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -18,6 +18,7 @@ #include #endif #include +#include #include #include "internal.h" @@ -170,6 +171,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) reliable_report_meminfo(m); + dynamic_pool_show_meminfo(m); + return 0; } diff --git a/fs/proc/page.c b/fs/proc/page.c index 195b077c0facbf8159b706361172c91a74c8815c..9a18d79e872453a2bb8953390618f584cad4f129 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -222,6 +222,9 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif +#ifdef CONFIG_DYNAMIC_POOL + u |= kpf_copy_bit(k, KPF_POOL, PG_pool); +#endif return u; }; diff --git a/include/linux/dynamic_pool.h b/include/linux/dynamic_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..88a7c06dcf052abd44bfde8902c59be6543a65b3 --- /dev/null +++ b/include/linux/dynamic_pool.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __LINUX_DYNAMIC_POOL_H +#define __LINUX_DYNAMIC_POOL_H + +#include +#include + +#ifdef CONFIG_DYNAMIC_POOL + +DECLARE_STATIC_KEY_FALSE(dynamic_pool_key); +#define dpool_enabled (static_branch_unlikely(&dynamic_pool_key)) + +enum pages_pool_type { + PAGES_POOL_1G, + PAGES_POOL_2M, + 
PAGES_POOL_4K, + PAGES_POOL_MAX, +}; + +struct split_page { + struct list_head entry; + unsigned long start_pfn; +}; + +struct pages_pool { + unsigned long free_pages; + long used_pages; + struct list_head freelist; + + /* Used for hugepage allocation */ + unsigned long nr_huge_pages; + unsigned long free_huge_pages; + unsigned long resv_huge_pages; + unsigned long used_huge_pages; + + /* Used for split page */ + unsigned long split_pages; + struct list_head splitlist; +}; + +struct pcp_pages_pool { + spinlock_t lock; + unsigned long free_pages; + long used_pages; + struct list_head freelist; +}; + +struct dynamic_pool_ops; + +struct dynamic_pool { + refcount_t refcnt; + bool online; + struct mem_cgroup *memcg; + struct dynamic_pool_ops *ops; + + spinlock_t lock; + struct pages_pool pool[PAGES_POOL_MAX]; + atomic_t pcp_refcnt; + struct pcp_pages_pool __percpu *pcp_pool; + + /* Used for dynamic hugetlb */ + int nid; + unsigned long total_pages; + + /* Used for dynamic pagelist */ + int range_cnt; + struct range *pfn_ranges; + unsigned long nr_poisoned_pages; +}; + +struct dpool_info { + struct mem_cgroup *memcg; + int range_cnt; + struct range pfn_ranges[0]; +}; + +bool __task_in_dynamic_pool(struct task_struct *tsk); +static inline bool task_in_dynamic_pool(struct task_struct *tsk) +{ + if (!dpool_enabled) + return false; + + return __task_in_dynamic_pool(tsk); +} + +static inline bool page_from_dynamic_pool(struct page *page) +{ + if (!dpool_enabled) + return false; + + return PagePool(page); +} + +static inline bool file_in_dynamic_pool(struct hugetlbfs_inode_info *p) +{ + if (!dpool_enabled) + return false; + + return p && p->dpool; +} + +bool page_in_dynamic_pool(struct page *page); +int dynamic_pool_can_attach(struct task_struct *tsk, struct mem_cgroup *memcg); +struct page *dynamic_pool_alloc_page(gfp_t gfp, unsigned int order, + unsigned int alloc_flags); +void dynamic_pool_free_page(struct page *page); +void dynamic_pool_bind_file(struct hugetlbfs_inode_info *p, struct hstate *h); +void dynamic_pool_unbind_file(struct hugetlbfs_inode_info *p); +int dynamic_pool_hugetlb_acct_memory(struct hstate *h, long delta, + struct hugetlbfs_inode_info *p); +struct folio *dynamic_pool_alloc_hugepage(struct hugetlbfs_inode_info *p, + struct hstate *h, bool reserved); +void dynamic_pool_free_hugepage(struct folio *folio, bool restore_reserve); + +void dynamic_pool_inherit(struct mem_cgroup *parent, struct mem_cgroup *memcg); +int dynamic_pool_destroy(struct cgroup *cgrp, bool *clear_css_online); + +bool dynamic_pool_hide_files(struct cftype *cft); +int dynamic_pool_add_memory(struct mem_cgroup *memcg, int nid, + unsigned long size); +void dynamic_pool_show(struct mem_cgroup *memcg, struct seq_file *m); +int dynamic_pool_reserve_hugepage(struct mem_cgroup *memcg, + unsigned long nr_pages, int type); + +int dpool_init(struct dpool_info *arg); +void dynamic_pool_show_meminfo(struct seq_file *m); + +#else +#define dpool_enabled 0 + +struct dynamic_pool {}; +struct dpool_info {}; + +static inline bool page_from_dynamic_pool(struct page *page) +{ + return false; +} + +static inline bool task_in_dynamic_pool(struct task_struct *tsk) +{ + return false; +} + +static inline bool page_in_dynamic_pool(const struct page *page) +{ + return false; +} + +static inline int dynamic_pool_can_attach(struct task_struct *tsk, + struct mem_cgroup *memcg) +{ + return 0; +} + +static inline struct page *dynamic_pool_alloc_page(gfp_t gfp, unsigned int order, + unsigned int alloc_flags) +{ + return NULL; +} + +static 
inline void dynamic_pool_free_page(struct page *page) +{ +} + +#ifdef CONFIG_HUGETLBFS +static inline bool file_in_dynamic_pool(struct hugetlbfs_inode_info *p) +{ + return false; +{ + +static inline void dynamic_pool_bind_file(struct hugetlbfs_inode_info *p, + struct hstate *h) +{ +} + +static inline void dynamic_pool_unbind_file(struct hugetlbfs_inode_info *p) +{ +} + +static inline int dynamic_pool_hugetlb_acct_memory(struct hstate *h, long delta, + struct hugetlbfs_inode_info *p) +{ + return -ENOMEM; +} + +static inline struct folio *dynamic_pool_alloc_hugepage(struct hugetlbfs_inode_info *p, + struct hstate *h, bool reserved) +{ + return NULL; +} + +static inline void dynamic_pool_free_hugepage(struct folio *folio, + bool restore_reserve) +{ +} +#endif + +static inline void dynamic_pool_inherit(struct mem_cgroup *parent, + struct mem_cgroup *memcg) +{ +} + +static inline int dynamic_pool_destroy(struct cgroup *cgrp, + bool *clear_css_online) +{ + return 0; +} + +#ifdef CONFIG_CGROUPS +static inline bool dynamic_pool_hide_files(struct cftype *cft) +{ + return false; +} +#endif + +static inline int dpool_init(struct dpool_info *arg) +{ + return 0; +} + +static inline void dynamic_pool_show_meminfo(struct seq_file *m) +{ +} +#endif /* CONFIG_DYNAMIC_POOL */ +#endif /* __LINUX_DYNAMIC_POOL_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d4cfefe576b55208a97caa9a544ab327219caad..f582d53492a8b3608a174a4f72e7974427827ae8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -282,6 +282,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, bool is_hugetlb_entry_migration(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); +void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio); +struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid); + #ifdef CONFIG_HUGETLB_INSERT_PAGE int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, pgprot_t prot, struct page *hpage); @@ -593,6 +596,9 @@ struct hugetlbfs_inode_info { struct shared_policy policy; struct inode vfs_inode; unsigned int seals; +#ifdef CONFIG_DYNAMIC_POOL + struct dynamic_pool *dpool; +#endif }; static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) @@ -800,6 +806,11 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); +void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, + unsigned int order); +bool prep_compound_gigantic_folio_for_demote(struct folio *folio, + unsigned int order); +void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 287d130ee9690d903fcd09de8fa5b43ae2430937..649fbb5c1adc5c5ffcb06f7b548e8f83146be48c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -28,6 +28,7 @@ struct page; struct mm_struct; struct kmem_cache; struct oom_control; +struct dynamic_pool; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -364,6 +365,10 @@ struct mem_cgroup { struct swap_device *swap_dev; #endif +#ifdef CONFIG_DYNAMIC_POOL + struct dynamic_pool *dpool; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; @@ -883,6 +888,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, void 
mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); +void mem_cgroup_scan_cgroups(struct mem_cgroup *memcg, + void (*fn)(struct mem_cgroup *, void *), + void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { @@ -1217,6 +1225,19 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, int memcg_get_swap_type(struct folio *folio); void memcg_remove_swapfile(int type); +/* Test whether @memcg has children, dead or alive. */ +static inline bool memcg_has_children(struct mem_cgroup *memcg) +{ + bool ret; + + rcu_read_lock(); + ret = css_next_child(NULL, &memcg->css); + rcu_read_unlock(); + return ret; +} + +int mem_cgroup_force_empty(struct mem_cgroup *memcg); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a7164c67dcd851de1facd9b741fc1e028c8c1a84..0580ddf546fcaab325839df9549c9b5b78876d01 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -308,6 +308,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE +extern void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); @@ -316,6 +317,10 @@ extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); #else +static inline void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +{ +} + static inline void try_offline_node(int nid) {} static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, diff --git a/include/linux/mm.h b/include/linux/mm.h index c00def598f9522ffb052fe3208a65847b306a20c..48a6b0865175b0ef398eebdd7448092a4265fe95 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3965,6 +3965,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, + MF_MSG_FREE_DPOOL, MF_MSG_UNKNOWN, }; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5c02720c53a5848e37f03beeadb2ac787a6a91f4..53060b67f5efc47692a413478061e116bf048b56 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -135,6 +135,9 @@ enum pageflags { #ifdef CONFIG_ARCH_USES_PG_ARCH_X PG_arch_2, PG_arch_3, +#endif +#ifdef CONFIG_DYNAMIC_POOL + PG_pool, /* Page is allocated from dynamic pool */ #endif __NR_PAGEFLAGS, @@ -603,6 +606,15 @@ PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif +/* + * PagePool() is used to track page allocated from dpool. 
+ */ +#ifdef CONFIG_DYNAMIC_POOL +PAGEFLAG(Pool, pool, PF_NO_TAIL) +#else +PAGEFLAG_FALSE(Pool, pool) +#endif + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; @@ -921,6 +933,9 @@ static inline bool is_page_hwpoison(struct page *page) #define PG_offline 0x00000100 #define PG_table 0x00000200 #define PG_guard 0x00000400 +#ifdef CONFIG_DYNAMIC_POOL +#define PG_dpool 0x00000800 +#endif #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -1012,6 +1027,13 @@ PAGE_TYPE_OPS(Table, table, pgtable) */ PAGE_TYPE_OPS(Guard, guard, guard) +#ifdef CONFIG_DYNAMIC_POOL +/* + * PageDpool() indicates that the page is free and in the dpool. + */ +PAGE_TYPE_OPS(Dpool, dpool, dpool) +#endif + extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index cbd3ddd7c33d4d12326bafbc3c33c60c6317358e..e7d9470a27cd698693094561f4665a3ad3ba630c 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -373,6 +373,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_BUDDY, "free buddy page" ) \ EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ + EM ( MF_MSG_FREE_DPOOL, "free dynamic pool page" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/include/trace/events/dynamic_pool.h b/include/trace/events/dynamic_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..f20360e0f00f785300853950f7193ffad4d35900 --- /dev/null +++ b/include/trace/events/dynamic_pool.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dynamic_pool + +#if !defined(_TRACE_DPOOL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DPOOL_H + +#include +#include +#include + +#define show_size(type) \ + __print_symbolic(type, \ + { PAGES_POOL_1G, "1G" }, \ + { PAGES_POOL_2M, "2M" }, \ + { PAGES_POOL_4K, "4K" }) + +TRACE_EVENT(dpool_demote, + + TP_PROTO(struct dynamic_pool *dpool, int type, struct page *page, + int ret), + + TP_ARGS(dpool, type, page, ret), + + TP_STRUCT__entry( + __field(struct dynamic_pool *, dpool) + __field(int, type) + __field(unsigned long, pfn) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dpool = dpool; + __entry->type = type; + __entry->pfn = page ? page_to_pfn(page) : -1UL; + __entry->ret = ret; + ), + + TP_printk("dpool=%p size=%s page=%p pfn=%lx ret=%d", + __entry->dpool, + show_size(__entry->type), + __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, + __entry->pfn, + __entry->ret) +); + +TRACE_EVENT(dpool_promote, + + TP_PROTO(struct dynamic_pool *dpool, int type, struct page *page, + int ret), + + TP_ARGS(dpool, type, page, ret), + + TP_STRUCT__entry( + __field(struct dynamic_pool *, dpool) + __field(int, type) + __field(unsigned long, pfn) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dpool = dpool; + __entry->type = type; + __entry->pfn = page ? page_to_pfn(page) : -1UL; + __entry->ret = ret; + ), + + TP_printk("dpool=%p size=%s page=%p pfn=%lx ret=%d", + __entry->dpool, + show_size(__entry->type), + __entry->pfn != -1UL ? 
pfn_to_page(__entry->pfn) : NULL, + __entry->pfn, + __entry->ret) +); + +TRACE_EVENT(dpool_acct_memory, + + TP_PROTO(struct dynamic_pool *dpool, int type, long delta, + unsigned long resv, int ret), + + TP_ARGS(dpool, type, delta, resv, ret), + + TP_STRUCT__entry( + __field(struct dynamic_pool *, dpool) + __field(int, type) + __field(long, delta) + __field(unsigned long, resv) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dpool = dpool; + __entry->type = type; + __entry->delta = delta; + __entry->resv = resv; + __entry->ret = ret; + ), + + TP_printk("dpool=%p size=%s delta=%ld resv=%lu ret=%d", + __entry->dpool, + show_size(__entry->type), + __entry->delta, + __entry->resv, + __entry->ret) +); + +TRACE_EVENT(dpool_alloc_hugepage, + + TP_PROTO(struct dynamic_pool *dpool, int type, struct folio *folio, + unsigned long free, unsigned long resv), + + TP_ARGS(dpool, type, folio, free, resv), + + TP_STRUCT__entry( + __field(struct dynamic_pool *, dpool) + __field(int, type) + __field(unsigned long, pfn) + __field(unsigned long, free) + __field(unsigned long, resv) + ), + + TP_fast_assign( + __entry->dpool = dpool; + __entry->type = type; + __entry->pfn = folio ? folio_pfn(folio) : -1UL; + __entry->free = free; + __entry->resv = resv; + ), + + TP_printk("dpool=%p size=%s page=%p pfn=%lx free=%lu resv=%lu", + __entry->dpool, + show_size(__entry->type), + __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, + __entry->pfn, + __entry->free, + __entry->resv) +); + +TRACE_EVENT(dpool_free_hugepage, + + TP_PROTO(struct dynamic_pool *dpool, int type, struct folio *folio, + unsigned long free, unsigned long resv), + + TP_ARGS(dpool, type, folio, free, resv), + + TP_STRUCT__entry( + __field(struct dynamic_pool *, dpool) + __field(int, type) + __field(unsigned long, pfn) + __field(unsigned long, free) + __field(unsigned long, resv) + ), + + TP_fast_assign( + __entry->dpool = dpool; + __entry->type = type; + __entry->pfn = folio ? folio_pfn(folio) : -1UL; + __entry->free = free; + __entry->resv = resv; + ), + + TP_printk("dpool=%p size=%s page=%p pfn=%lx free=%lu resv=%lu", + __entry->dpool, + show_size(__entry->type), + __entry->pfn != -1UL ? 
pfn_to_page(__entry->pfn) : NULL, + __entry->pfn, + __entry->free, + __entry->resv) +); + +#endif /* _TRACE_DPOOL_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 6e24b2fbc4458f4ad012e4916aa30d7120ed212a..37962289a7a55058170f74bf71f04d932b197d0f 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -90,6 +90,12 @@ #define IF_HAVE_PG_IDLE(_name) #endif +#ifdef CONFIG_DYNAMIC_POOL +#define IF_HAVE_PG_POOL(_name) ,{1UL << PG_##_name, __stringify(_name)} +#else +#define IF_HAVE_PG_POOL(_name) +#endif + #ifdef CONFIG_ARCH_USES_PG_ARCH_X #define IF_HAVE_PG_ARCH_X(_name) ,{1UL << PG_##_name, __stringify(_name)} #else @@ -125,6 +131,7 @@ IF_HAVE_PG_UNCACHED(uncached) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ IF_HAVE_PG_IDLE(young) \ +IF_HAVE_PG_POOL(pool) \ IF_HAVE_PG_ARCH_X(arch_2) \ IF_HAVE_PG_ARCH_X(arch_3) diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 6f2f2720f3ac2697ebcad1e0f963bf350bc14676..f8297cb68bdd4ec5a6f3de9f94b94a3a56f75f5d 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -36,5 +36,6 @@ #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 #define KPF_PGTABLE 26 +#define KPF_POOL 27 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e6e876fa440242d305978fad4e4564c0867631c1..e342774b9215840b00d7aa0b94a69e1a315434ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -4329,6 +4330,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; + if (dynamic_pool_hide_files(cft)) + continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { @@ -5965,6 +5968,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; + bool clear_css_online = false; lockdep_assert_held(&cgroup_mutex); @@ -5983,6 +5987,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (css_has_online_children(&cgrp->self)) return -EBUSY; + /* + * If dynamic pool is enabled, make sure dpool is destroyed before + * removing the corresponding memory cgroup. If CSS_ONLINE is set, + * this function will clear it and set clear_css_online to true. + */ + if (dynamic_pool_destroy(cgrp, &clear_css_online)) + return -EBUSY; + /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling diff --git a/mm/Kconfig b/mm/Kconfig index 2df11b146c8402b69c6671f96df0f41c390c817d..82dbe6c28fcb3501dd678f47ac72e21d8af96d5c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1365,6 +1365,15 @@ config MEMORY_RELIABLE To enable this function, mirrored memory is needed and "kernelcore=reliable" need to be added in kernel parameters. +config DYNAMIC_POOL + bool "Dynamic Pool support" + depends on X86_64 || (ARM64 && ARM64_4K_PAGES) + depends on MEMCG && HUGETLB_PAGE + default n + help + A per-memcg pagepool. The task in the memcg will prefer to alloc + pages from corresponding pool. 
+ source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index e1a853e318565a01cc22f817f4db5cda58419b23..8d7d2aeda6eab22e6fc43a147466444a299fb158 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -145,3 +145,4 @@ obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o +obj-$(CONFIG_DYNAMIC_POOL) += dynamic_pool.o diff --git a/mm/dynamic_pool.c b/mm/dynamic_pool.c new file mode 100644 index 0000000000000000000000000000000000000000..88cb3333ba9f3b0b88065013053a1765abaaa3c1 --- /dev/null +++ b/mm/dynamic_pool.c @@ -0,0 +1,1722 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * dynamic pool core file + * + * Copyright (C) 2024 Huawei Limited. + */ + +#define pr_fmt(fmt) "Dynamic pool: " fmt + +#include +#include +#include "internal.h" +#include "hugetlb_vmemmap.h" + +#define CREATE_TRACE_POINTS +#include + +static bool enable_dhugetlb; +static bool enable_dpagelist; + +/* Indicate the enabled of dynamic pool */ +DEFINE_STATIC_KEY_FALSE(dynamic_pool_key); + +/* Protect the operation of dynamic pool */ +static DEFINE_MUTEX(dpool_mutex); + +/* Introduce the special opeartion. */ +struct dynamic_pool_ops { + int (*fill_pool)(struct dynamic_pool *dpool, void *arg); + int (*drain_pool)(struct dynamic_pool *dpool); + int (*restore_pool)(struct dynamic_pool *dpool); +}; + +/* Used to record the mapping of page and dpool */ +struct dpool_page_array { + unsigned long count; + struct dynamic_pool *dpool[]; +}; + +#define DEFAULT_PAGE_ARRAY_COUNT 4096 +#define hugepage_index(pfn) ((pfn) >> PUD_ORDER) +static struct dpool_page_array *dpool_page_array; +static DEFINE_RWLOCK(dpool_page_array_rwlock); + +/* For dpagelist, there are only one dpool */ +static struct dynamic_pool *dpool_global_pool; + +/* Used for percpu pages pool */ +#define PCP_PAGE_MAX 1024 +#define PCP_PAGE_BATCH (PCP_PAGE_MAX >> 2) + +/* === reference function ============================================= */ + +static bool dpool_get_unless_zero(struct dynamic_pool *dpool) +{ + if (!dpool) + return false; + + return refcount_inc_not_zero(&dpool->refcnt); +} + +static void dpool_put(struct dynamic_pool *dpool) +{ + if (!dpool) + return; + + if (refcount_dec_and_test(&dpool->refcnt)) { + dpool->memcg->dpool = NULL; + css_put(&dpool->memcg->css); + dpool_global_pool = NULL; + synchronize_rcu(); + free_percpu(dpool->pcp_pool); + kfree(dpool->pfn_ranges); + kfree(dpool); + } +} + +static struct dynamic_pool *dpool_get_from_memcg(struct mem_cgroup *memcg) +{ + struct dynamic_pool *dpool; + + rcu_read_lock(); + dpool = memcg->dpool; + if (!dpool_get_unless_zero(dpool)) + dpool = NULL; + rcu_read_unlock(); + + return dpool; +} + +static struct dynamic_pool *dpool_get_from_task(struct task_struct *tsk) +{ + struct dynamic_pool *dpool = NULL; + struct mem_cgroup *memcg; + + if (!dpool_enabled) + return NULL; + + rcu_read_lock(); + do { + memcg = mem_cgroup_from_task(tsk); + } while (memcg && !css_tryget(&memcg->css)); + rcu_read_unlock(); + if (!memcg) + return NULL; + + dpool = dpool_get_from_memcg(memcg); + css_put(&memcg->css); + + return dpool; +} + +static struct dynamic_pool *dpool_get_from_page(struct page *page) +{ + struct dynamic_pool *dpool = NULL; + unsigned long idx; + + rcu_read_lock(); + if (enable_dhugetlb) { + idx = hugepage_index(page_to_pfn(page)); + read_lock(&dpool_page_array_rwlock); + if (idx < dpool_page_array->count) + dpool = 
dpool_page_array->dpool[idx]; + read_unlock(&dpool_page_array_rwlock); + } else if (enable_dpagelist) { + /* + * Attention: dpool_global_pool return for any page, + * so need other check to make sure it is from dpool. + */ + dpool = dpool_global_pool; + } + + if (!dpool_get_unless_zero(dpool)) + dpool = NULL; + rcu_read_unlock(); + + return dpool; +} + +bool __task_in_dynamic_pool(struct task_struct *tsk) +{ + struct dynamic_pool *dpool; + + if (!dpool_enabled) + return false; + + dpool = dpool_get_from_task(tsk); + dpool_put(dpool); + + return !!dpool; +} + +bool page_in_dynamic_pool(struct page *page) +{ + struct dynamic_pool *dpool; + + if (!dpool_enabled) + return false; + + if (PageDpool(page)) + return true; + + /* + * If the page don't have the flags, it may be in pcp list. + * Check it using the page range. + */ + dpool = dpool_get_from_page(page); + if (enable_dpagelist && dpool) { + unsigned long pfn = page_to_pfn(page); + int range_cnt = dpool->range_cnt; + struct range *range; + int i; + + for (i = 0; i < range_cnt; i++) { + range = &dpool->pfn_ranges[i]; + if (pfn >= range->start && pfn <= range->end) + goto put; + } + + /* The pfn is not in the range, set dpool to NULL */ + dpool = NULL; + } +put: + dpool_put(dpool); + + return !!dpool; +} + +/* === demote and promote function ==================================== */ + +static void dpool_disable_pcp_pool(struct dynamic_pool *dpool, bool drain); +static void dpool_enable_pcp_pool(struct dynamic_pool *dpool); + +/* + * Clear compound structure which is inverse of prep_compound_page, + * For detail, see destroy_compound_hugetlb_folio_for_demote. + */ +static void clear_compound_page(struct folio *folio, unsigned int order) +{ + int i; + int nr_pages = 1 << order; + struct page *p; + + atomic_set(&folio->_entire_mapcount, 0); + atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_pincount, 0); + + for (i = 0; i < nr_pages; i++) { + p = folio_page(folio, i); + p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; + p->mapping = NULL; + if (!i) + __ClearPageHead(p); + else + clear_compound_head(p); + set_page_private(p, 0); + } +} + +static int dpool_demote_gigantic_page(struct pages_pool *src_pool, + struct pages_pool *dst_pool, + struct page *page) +{ + struct folio *folio = page_folio(page); + struct hstate *h = size_to_hstate(PMD_SIZE); + int nr_pages = 1 << PUD_ORDER; + int block_size = 1 << PMD_ORDER; + struct page *subpage; + int i; + + if (PageHWPoison(page)) + return -EHWPOISON; + + list_del(&page->lru); + __ClearPageDpool(page); + src_pool->free_pages--; + + destroy_compound_hugetlb_folio_for_demote(folio, PUD_ORDER); + + for (i = 0; i < nr_pages; i += block_size) { + subpage = folio_page(folio, i); + prep_compound_page(subpage, PMD_ORDER); + folio_change_private(page_folio(subpage), NULL); + __SetPageDpool(subpage); + __prep_new_hugetlb_folio(h, page_folio(subpage)); + list_add_tail(&subpage->lru, &dst_pool->freelist); + dst_pool->free_pages++; + } + + return 0; +} + +static int dpool_demote_huge_page(struct pages_pool *src_pool, + struct pages_pool *dst_pool, + struct page *page) +{ + struct folio *folio = page_folio(page); + int nr_pages = 1 << PMD_ORDER; + struct page *subpage; + int i; + + if (PageHWPoison(page)) + return -EHWPOISON; + + list_del(&page->lru); + __ClearPageDpool(page); + src_pool->free_pages--; + + clear_compound_page(page_folio(page), PMD_ORDER); + for (i = 0; i < nr_pages; i++) { + subpage = folio_page(folio, i); + free_pages_prepare(subpage, 0, 0); + __SetPageDpool(subpage); + 
list_add_tail(&subpage->lru, &dst_pool->freelist); + dst_pool->free_pages++; + } + + return 0; +} + +static int dpool_demote_pool_locked(struct dynamic_pool *dpool, int type) +{ + struct pages_pool *src_pool, *dst_pool; + struct split_page *spage = NULL; + struct page *page = NULL; + int ret = -ENOMEM; + + lockdep_assert_held(&dpool->lock); + + if (type < 0 || type >= PAGES_POOL_MAX - 1) + return -EINVAL; + + src_pool = &dpool->pool[type]; + dst_pool = &dpool->pool[type + 1]; + + spage = kzalloc(sizeof(struct split_page), GFP_ATOMIC); + if (!spage) + goto out; + + if (!src_pool->free_pages && dpool_demote_pool_locked(dpool, type - 1)) + goto out; + + list_for_each_entry(page, &src_pool->freelist, lru) { + switch (type) { + case PAGES_POOL_1G: + ret = dpool_demote_gigantic_page(src_pool, dst_pool, page); + break; + case PAGES_POOL_2M: + ret = dpool_demote_huge_page(src_pool, dst_pool, page); + break; + default: + BUG(); + } + if (!ret) + break; + } + +out: + if (!ret) { + spage->start_pfn = page_to_pfn(page); + list_add(&spage->entry, &src_pool->splitlist); + src_pool->split_pages++; + } else { + kfree(spage); + } + trace_dpool_demote(dpool, type, page, ret); + + return ret; +} + +static int dpool_promote_gigantic_page(struct pages_pool *src_pool, + struct pages_pool *dst_pool, + struct split_page *spage) +{ + struct hstate *h = size_to_hstate(PUD_SIZE); + int nr_pages = 1 << PUD_ORDER; + int block_size = 1 << PMD_ORDER; + struct page *page, *subpage; + int i; + + for (i = 0; i < nr_pages; i += block_size) { + subpage = pfn_to_page(spage->start_pfn + i); + if (!PageDpool(subpage)) + return -EBUSY; + + if (PageHWPoison(subpage)) + return -EHWPOISON; + } + + for (i = 0; i < nr_pages; i += block_size) { + subpage = pfn_to_page(spage->start_pfn + i); + clear_compound_page(page_folio(subpage), PMD_ORDER); + __ClearPageDpool(subpage); + list_del(&subpage->lru); + src_pool->free_pages--; + } + + page = pfn_to_page(spage->start_pfn); + prep_compound_gigantic_folio_for_demote(page_folio(page), PUD_ORDER); + folio_change_private(page_folio(page), NULL); + __SetPageDpool(page); + __prep_new_hugetlb_folio(h, page_folio(page)); + list_add_tail(&page->lru, &dst_pool->freelist); + dst_pool->free_pages++; + + return 0; +} + +static int dpool_promote_huge_page(struct pages_pool *src_pool, + struct pages_pool *dst_pool, + struct split_page *spage) +{ + struct hstate *h = size_to_hstate(PMD_SIZE); + int nr_pages = 1 << PMD_ORDER; + struct page *page, *subpage; + int i; + + for (i = 0; i < nr_pages; i++) { + subpage = pfn_to_page(spage->start_pfn + i); + if (!PageDpool(subpage)) + return -EBUSY; + + if (PageHWPoison(subpage)) + return -EHWPOISON; + } + + for (i = 0; i < nr_pages; i++) { + subpage = pfn_to_page(spage->start_pfn + i); + __ClearPageDpool(subpage); + list_del(&subpage->lru); + src_pool->free_pages--; + } + + page = pfn_to_page(spage->start_pfn); + prep_new_page(page, PMD_ORDER, __GFP_COMP, 0); + set_page_count(page, 0); + folio_change_private(page_folio(page), NULL); + __SetPageDpool(page); + __prep_new_hugetlb_folio(h, page_folio(page)); + list_add_tail(&page->lru, &dst_pool->freelist); + dst_pool->free_pages++; + + return 0; +} + +static int dpool_promote_pool(struct dynamic_pool *dpool, int type) +{ + struct pages_pool *src_pool, *dst_pool; + struct split_page *spage, *spage_next; + struct page *page = NULL; + int ret = -ENOMEM; + + + if (type < 0 || type >= PAGES_POOL_MAX - 1) + return -EINVAL; + + src_pool = &dpool->pool[type + 1]; + dst_pool = &dpool->pool[type]; + + spin_lock(&dpool->lock); 
+ + if (!dst_pool->split_pages) + goto unlock; + + list_for_each_entry_safe(spage, spage_next, &dst_pool->splitlist, entry) { + switch (type) { + case PAGES_POOL_1G: + ret = dpool_promote_gigantic_page(src_pool, dst_pool, spage); + break; + case PAGES_POOL_2M: { + unsigned long nr_pages = 1 << PMD_ORDER; + + /* + * Since the dpool_mutex is already locked, + * there is no way to free spage_next, so + * it is safe to unlock here. + */ + spin_unlock(&dpool->lock); + cond_resched(); + lru_add_drain_all(); + dpool_disable_pcp_pool(dpool, true); + do_migrate_range(spage->start_pfn, + spage->start_pfn + nr_pages); + spin_lock(&dpool->lock); + dpool_enable_pcp_pool(dpool); + ret = dpool_promote_huge_page(src_pool, dst_pool, spage); + break; + } + default: + BUG(); + } + if (!ret) + break; + } + + if (!ret) { + page = pfn_to_page(spage->start_pfn); + list_del(&spage->entry); + dst_pool->split_pages--; + } + +unlock: + spin_unlock(&dpool->lock); + if (!ret) + kfree(spage); + trace_dpool_promote(dpool, type, page, ret); + + return ret; +} + +/* === percpu pool function =========================================== */ + +static void dpool_refill_pcp_pool(struct dynamic_pool *dpool, + struct pcp_pages_pool *pcp_pool, + unsigned long count) +{ + struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K]; + struct page *page, *next; + int i = 0; + + lockdep_assert_held(&pcp_pool->lock); + + spin_lock(&dpool->lock); + + if (!pool->free_pages && dpool_demote_pool_locked(dpool, PAGES_POOL_2M)) + goto unlock; + + list_for_each_entry_safe(page, next, &pool->freelist, lru) { + list_move_tail(&page->lru, &pcp_pool->freelist); + __ClearPageDpool(page); + pool->free_pages--; + pcp_pool->free_pages++; + if (++i == count) + break; + } + +unlock: + spin_unlock(&dpool->lock); +} + +static void dpool_drain_pcp_pool(struct dynamic_pool *dpool, + struct pcp_pages_pool *pcp_pool, + unsigned long count) +{ + struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K]; + struct page *page, *next; + int i = 0; + + lockdep_assert_held(&pcp_pool->lock); + + spin_lock(&dpool->lock); + list_for_each_entry_safe(page, next, &pcp_pool->freelist, lru) { + list_move_tail(&page->lru, &pool->freelist); + __SetPageDpool(page); + pcp_pool->free_pages--; + pool->free_pages++; + if (++i == count) + break; + } + + pool->used_pages += pcp_pool->used_pages; + pcp_pool->used_pages = 0; + spin_unlock(&dpool->lock); +} + +static void dpool_drain_all_pcp_pool(struct dynamic_pool *dpool) +{ + struct pcp_pages_pool *pcp_pool; + unsigned long flags; + int cpu; + + for_each_possible_cpu(cpu) { + pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu); + spin_lock_irqsave(&pcp_pool->lock, flags); + dpool_drain_pcp_pool(dpool, pcp_pool, pcp_pool->free_pages); + spin_unlock_irqrestore(&pcp_pool->lock, flags); + } +} + +static void dpool_wait_all_pcp_pool_unlock(struct dynamic_pool *dpool) +{ + struct pcp_pages_pool *pcp_pool; + unsigned long flags; + int cpu; + + for_each_possible_cpu(cpu) { + pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu); + spin_lock_irqsave(&pcp_pool->lock, flags); + spin_unlock_irqrestore(&pcp_pool->lock, flags); + } +} + + +/* The caller have to make sure no others write the count */ +static void dpool_sum_pcp_pool(struct dynamic_pool *dpool, + unsigned long *free_pages, long *used_pages) +{ + struct pcp_pages_pool *pcp_pool; + int cpu; + + *free_pages = 0; + *used_pages = 0; + for_each_possible_cpu(cpu) { + pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu); + *free_pages += pcp_pool->free_pages; + *used_pages += pcp_pool->used_pages; + } +} + +static void 
dpool_disable_pcp_pool(struct dynamic_pool *dpool, bool drain) +{ + atomic_inc(&dpool->pcp_refcnt); + /* After increase refcount, wait for other user to unlock. */ + if (drain) + dpool_drain_all_pcp_pool(dpool); + else + dpool_wait_all_pcp_pool_unlock(dpool); +} + +static void dpool_enable_pcp_pool(struct dynamic_pool *dpool) +{ + atomic_dec(&dpool->pcp_refcnt); +} + +static bool dpool_pcp_enabled(struct dynamic_pool *dpool) +{ + return !atomic_read(&dpool->pcp_refcnt); +} + +static struct page *dpool_alloc_pcp_page(struct dynamic_pool *dpool) +{ + struct pcp_pages_pool *pcp_pool; + struct page *page = NULL; + unsigned long flags; + + pcp_pool = this_cpu_ptr(dpool->pcp_pool); + spin_lock_irqsave(&pcp_pool->lock, flags); + if (!dpool->online || !dpool_pcp_enabled(dpool)) + goto unlock; + +retry: + page = NULL; + if (!pcp_pool->free_pages) + dpool_refill_pcp_pool(dpool, pcp_pool, PCP_PAGE_BATCH); + + page = list_first_entry_or_null(&pcp_pool->freelist, struct page, lru); + if (!page) + goto unlock; + + list_del(&page->lru); + pcp_pool->free_pages--; + pcp_pool->used_pages++; + + if (check_new_page(page)) { + SetPagePool(page); + goto retry; + } + + SetPagePool(page); + +unlock: + spin_unlock_irqrestore(&pcp_pool->lock, flags); + + return page; +} + +static int dpool_free_pcp_page(struct dynamic_pool *dpool, struct page *page) +{ + struct pcp_pages_pool *pcp_pool; + unsigned long flags; + int ret = 0; + + pcp_pool = this_cpu_ptr(dpool->pcp_pool); + spin_lock_irqsave(&pcp_pool->lock, flags); + if (!dpool_pcp_enabled(dpool)) { + ret = -EINVAL; + goto unlock; + } + + ClearPagePool(page); + if (!free_pages_prepare(page, 0, 0)) { + SetPagePool(page); + goto unlock; + } + + list_add(&page->lru, &pcp_pool->freelist); + pcp_pool->free_pages++; + pcp_pool->used_pages--; + if (pcp_pool->free_pages > PCP_PAGE_MAX) + dpool_drain_pcp_pool(dpool, pcp_pool, PCP_PAGE_BATCH); + +unlock: + spin_unlock_irqrestore(&pcp_pool->lock, flags); + + return ret; +} + +/* === allocation interface =========================================== */ + +int dynamic_pool_can_attach(struct task_struct *tsk, struct mem_cgroup *memcg) +{ + struct dynamic_pool *src_dpool, *dst_dpool; + int ret = 0; + + if (!dpool_enabled) + return 0; + + src_dpool = dpool_get_from_task(tsk); + if (!src_dpool) + return 0; + + dst_dpool = dpool_get_from_memcg(memcg); + if (dst_dpool != src_dpool) + ret = -EPERM; + + dpool_put(src_dpool); + dpool_put(dst_dpool); + + return ret; +} + +static bool dpool_should_alloc(gfp_t gfp_mask, unsigned int order) +{ + gfp_t gfp = gfp_mask & GFP_HIGHUSER_MOVABLE; + + if (current->flags & PF_KTHREAD) + return false; + + if (order != 0) + return false; + + /* + * The cgroup only charges anonymous and file pages from usespage. + * some filesystem maybe has masked out the __GFP_IO | __GFP_FS + * to avoid recursive memory request. eg: loop device, xfs. 
+ */ + if ((gfp | __GFP_IO | __GFP_FS) != GFP_HIGHUSER_MOVABLE) + return false; + + return true; +} + +struct page *dynamic_pool_alloc_page(gfp_t gfp, unsigned int order, + unsigned int alloc_flags) +{ + struct dynamic_pool *dpool; + struct pages_pool *pool; + struct page *page = NULL; + unsigned long flags; + + if (!dpool_enabled) + return NULL; + + if (!dpool_should_alloc(gfp, order)) + return NULL; + + dpool = dpool_get_from_task(current); + if (!dpool) + return NULL; + + page = dpool_alloc_pcp_page(dpool); + if (page) + goto put; + + pool = &dpool->pool[PAGES_POOL_4K]; + spin_lock_irqsave(&dpool->lock, flags); + if (!dpool->online) + goto unlock; + +retry: + page = NULL; + if (!pool->free_pages && dpool_demote_pool_locked(dpool, PAGES_POOL_2M)) { + spin_unlock_irqrestore(&dpool->lock, flags); + dpool_drain_all_pcp_pool(dpool); + spin_lock_irqsave(&dpool->lock, flags); + if (!dpool->online || !pool->free_pages) + goto unlock; + } + + page = list_first_entry_or_null(&pool->freelist, struct page, lru); + if (!page) + goto unlock; + + __ClearPageDpool(page); + list_del(&page->lru); + pool->free_pages--; + pool->used_pages++; + + if (check_new_page(page)) { + /* This is a bad page, treat it as a used pages */ + SetPagePool(page); + goto retry; + } + + SetPagePool(page); + +unlock: + spin_unlock_irqrestore(&dpool->lock, flags); +put: + dpool_put(dpool); + if (page) + prep_new_page(page, order, gfp, alloc_flags); + + return page; +} + +void dynamic_pool_free_page(struct page *page) +{ + struct dynamic_pool *dpool; + struct pages_pool *pool; + unsigned long flags; + + if (!dpool_enabled) + return; + + dpool = dpool_get_from_page(page); + if (!dpool) { + pr_err("get dpool failed when free page 0x%px\n", page); + return; + } + + if (!dpool_free_pcp_page(dpool, page)) + goto put; + + pool = &dpool->pool[PAGES_POOL_4K]; + spin_lock_irqsave(&dpool->lock, flags); + + ClearPagePool(page); + if (!free_pages_prepare(page, 0, 0)) { + SetPagePool(page); + goto unlock; + } + + __SetPageDpool(page); + list_add(&page->lru, &pool->freelist); + pool->free_pages++; + pool->used_pages--; + +unlock: + spin_unlock_irqrestore(&dpool->lock, flags); +put: + dpool_put(dpool); +} + +void dynamic_pool_bind_file(struct hugetlbfs_inode_info *p, struct hstate *h) +{ + unsigned long size; + + if (!dpool_enabled || !p) + return; + + size = huge_page_size(h); + if (size == PMD_SIZE || size == PUD_SIZE) + p->dpool = dpool_get_from_task(current); + else + p->dpool = NULL; +} + +void dynamic_pool_unbind_file(struct hugetlbfs_inode_info *p) +{ + struct dynamic_pool *dpool; + + if (!dpool_enabled || !p || !p->dpool) + return; + + dpool = p->dpool; + p->dpool = NULL; + dpool_put(dpool); +} + +int dynamic_pool_hugetlb_acct_memory(struct hstate *h, long delta, + struct hugetlbfs_inode_info *p) +{ + struct dynamic_pool *dpool; + struct pages_pool *pool; + unsigned long flags; + int type; + int ret = -ENOMEM; + + if (!dpool_enabled || !p || !p->dpool) + return 0; + + dpool = p->dpool; + spin_lock_irqsave(&dpool->lock, flags); + + if (hstate_is_gigantic(h)) + type = PAGES_POOL_1G; + else + type = PAGES_POOL_2M; + pool = &dpool->pool[type]; + + if (delta > 0) { + if (delta <= pool->free_huge_pages - pool->resv_huge_pages) { + pool->resv_huge_pages += delta; + ret = 0; + } + } else { + pool->resv_huge_pages -= (unsigned long)(-delta); + WARN_ON(pool->resv_huge_pages < 0); + ret = 0; + } + spin_unlock_irqrestore(&dpool->lock, flags); + trace_dpool_acct_memory(dpool, type, delta, pool->resv_huge_pages, + ret); + + return ret; +} + +struct 
folio *dynamic_pool_alloc_hugepage(struct hugetlbfs_inode_info *p, + struct hstate *h, bool reserved) +{ + struct dynamic_pool *dpool; + struct pages_pool *pool; + struct folio *folio = NULL; + unsigned long flags; + int type; + + if (!dpool_enabled) + return NULL; + + dpool = p->dpool; + if (!dpool) + return NULL; + + spin_lock_irqsave(&dpool->lock, flags); + if (!dpool->online) + goto unlock; + + if (hstate_is_gigantic(h)) + type = PAGES_POOL_1G; + else + type = PAGES_POOL_2M; + pool = &dpool->pool[type]; + + list_for_each_entry(folio, &pool->freelist, lru) { + if (folio_test_hwpoison(folio)) + continue; + + list_del(&folio->lru); + __folio_clear_dpool(folio); + folio_ref_unfreeze(folio, 1); + pool->free_huge_pages--; + pool->used_huge_pages++; + if (reserved) { + folio_set_hugetlb_restore_reserve(folio); + pool->resv_huge_pages--; + } + folio_set_pool(folio); + goto unlock; + } + folio = NULL; + +unlock: + spin_unlock_irqrestore(&dpool->lock, flags); + trace_dpool_alloc_hugepage(dpool, type, folio, pool->free_huge_pages, + pool->resv_huge_pages); + + return folio; +} + +void dynamic_pool_free_hugepage(struct folio *folio, bool restore_reserve) +{ + struct hstate *h = folio_hstate(folio); + struct dynamic_pool *dpool; + struct pages_pool *pool; + unsigned long flags; + int type; + + if (!dpool_enabled) + return; + + dpool = dpool_get_from_page(folio_page(folio, 0)); + if (!dpool) { + pr_err("get dpool failed when free hugepage 0x%px\n", folio); + return; + } + + spin_lock_irqsave(&dpool->lock, flags); + if (hstate_is_gigantic(h)) + type = PAGES_POOL_1G; + else + type = PAGES_POOL_2M; + pool = &dpool->pool[type]; + + if (folio_test_hwpoison(folio)) + goto unlock; + + folio_clear_pool(folio); + __folio_set_dpool(folio); + list_add(&folio->lru, &pool->freelist); + pool->free_huge_pages++; + pool->used_huge_pages--; + if (restore_reserve) + pool->resv_huge_pages++; + +unlock: + spin_unlock_irqrestore(&dpool->lock, flags); + dpool_put(dpool); + trace_dpool_free_hugepage(dpool, type, folio, pool->free_huge_pages, + pool->resv_huge_pages); +} + +/* === dynamic pool function ========================================== */ + +static void dpool_dump_child_memcg(struct mem_cgroup *memcg, void *message) +{ + struct mem_cgroup *root = (struct mem_cgroup *)message; + struct cgroup *cgrp; + + if (root == memcg) + return; + + cgrp = memcg->css.cgroup; + pr_err("child memcg exists: "); + pr_cont_cgroup_name(cgrp); + pr_cont("\n"); +} + +static struct dynamic_pool *dpool_create(struct mem_cgroup *memcg, + struct dynamic_pool_ops *ops) +{ + struct dynamic_pool *dpool; + int cpu; + int i; + + if (memcg_has_children(memcg)) { + pr_err("create failed, memcg has children\n"); + mem_cgroup_scan_cgroups(memcg, dpool_dump_child_memcg, memcg); + return NULL; + } + + dpool = kzalloc(sizeof(struct dynamic_pool), GFP_KERNEL); + if (!dpool) + return NULL; + + dpool->pcp_pool = alloc_percpu(struct pcp_pages_pool); + if (!dpool->pcp_pool) { + kfree(dpool); + return NULL; + } + + spin_lock_init(&dpool->lock); + refcount_set(&dpool->refcnt, 1); + dpool->memcg = memcg; + dpool->ops = ops; + atomic_set(&dpool->pcp_refcnt, 0); + + for (i = 0; i < PAGES_POOL_MAX; i++) { + INIT_LIST_HEAD(&dpool->pool[i].freelist); + INIT_LIST_HEAD(&dpool->pool[i].splitlist); + } + + for_each_possible_cpu(cpu) { + struct pcp_pages_pool *pcp_pool; + + pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu); + spin_lock_init(&pcp_pool->lock); + INIT_LIST_HEAD(&pcp_pool->freelist); + pcp_pool->free_pages = 0; + pcp_pool->used_pages = 0; + } + + 
css_get(&memcg->css); + memcg->dpool = dpool; + dpool->online = true; + + return dpool; +} + +void dynamic_pool_inherit(struct mem_cgroup *parent, struct mem_cgroup *memcg) +{ + struct dynamic_pool *dpool; + + if (!dpool_enabled || !parent || !memcg) + return; + + dpool = dpool_get_from_memcg(parent); + memcg->dpool = dpool; + + /* Don't increase refcount for child memcg */ + dpool_put(dpool); +} + +int dynamic_pool_destroy(struct cgroup *cgrp, bool *clear_css_online) +{ + struct cgroup_subsys_state *css = cgrp->subsys[memory_cgrp_id]; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct dynamic_pool *dpool; + int ret = 0; + + if (!dpool_enabled || !memcg) + return 0; + + mutex_lock(&dpool_mutex); + dpool = dpool_get_from_memcg(memcg); + if (!dpool) + goto unlock; + + if (dpool->memcg != memcg) + goto put; + + /* A offline dpool is not allowed for allocation */ + dpool->online = false; + /* Disable pcp pool forever */ + dpool_disable_pcp_pool(dpool, true); + + /* + * Even if no process exists in the memory cgroup, some pages may + * still be occupied. Release these pages before restore pool. + */ + mem_cgroup_force_empty(dpool->memcg); + + BUG_ON(!dpool->ops->restore_pool); + ret = dpool->ops->restore_pool(dpool); + if (ret) { + pr_err("restore pool failed\n"); + goto put; + } + + BUG_ON(!dpool->ops->drain_pool); + ret = dpool->ops->drain_pool(dpool); + if (ret) { + pr_err("drain pool failed\n"); + goto put; + } + + memcg->dpool = NULL; + + /* Release the initial reference count */ + dpool_put(dpool); + + /* + * Since dpool is destroyed and the memcg will be freed then, + * clear CSS_ONLINE immediately to prevent race with create. + */ + if (cgrp->self.flags & CSS_ONLINE) { + cgrp->self.flags &= ~CSS_ONLINE; + *clear_css_online = true; + } + +put: + dpool_put(dpool); +unlock: + mutex_unlock(&dpool_mutex); + + return ret; +} + +static int __init dynamic_pool_init(void) +{ + if (!enable_dhugetlb && !enable_dpagelist) + return 0; + + if (enable_dhugetlb) { + unsigned long count, size; + + count = max_t(unsigned long, hugepage_index(max_pfn), + DEFAULT_PAGE_ARRAY_COUNT); + size = sizeof(struct dpool_page_array) + + count * sizeof(struct dynamic_pool *); + dpool_page_array = kzalloc(size, GFP_KERNEL); + if (!dpool_page_array) { + pr_err("init failed\n"); + return -ENOMEM; + } + + dpool_page_array->count = count; + } + + static_branch_enable(&dynamic_pool_key); + pr_info("enabled\n"); + + return 0; +} +subsys_initcall(dynamic_pool_init); + +/* === Dynamic hugetlb interface ====================================== */ + +static int __init dynamic_hugetlb_setup(char *buf) +{ + if (enable_dpagelist) + return 0; + + return kstrtobool(buf, &enable_dhugetlb); +} +early_param("dynamic_hugetlb", dynamic_hugetlb_setup); + +static int dpool_record_page(struct dynamic_pool *dpool, unsigned long idx) +{ + read_lock(&dpool_page_array_rwlock); + + /* + * If page's pfn is greater than dhugetlb_pagelist_t->count (which + * may occurs due to memory hotplug) then dhugetlb_pagelist_t need + * to be reallocated, so need write_lock here. 
+ */ + if (idx >= dpool_page_array->count) { + unsigned long size; + struct dpool_page_array *tmp; + + read_unlock(&dpool_page_array_rwlock); + write_lock(&dpool_page_array_rwlock); + + size = sizeof(struct dpool_page_array) + + (idx + 1) * sizeof(struct dynamic_pool *); + tmp = krealloc(dpool_page_array, size, GFP_ATOMIC); + if (!tmp) { + write_unlock(&dpool_page_array_rwlock); + return -ENOMEM; + } + + tmp->count = idx + 1; + dpool_page_array = tmp; + + write_unlock(&dpool_page_array_rwlock); + read_lock(&dpool_page_array_rwlock); + } + dpool_page_array->dpool[idx] = dpool; + read_unlock(&dpool_page_array_rwlock); + + return 0; +} + +static int dpool_fill_from_hugetlb(struct dynamic_pool *dpool, void *arg) +{ + struct hstate *h = size_to_hstate(PUD_SIZE); + unsigned long nr_pages = *(unsigned long *)arg; + int nid = dpool->nid; + unsigned long count = 0; + struct pages_pool *pool = &dpool->pool[PAGES_POOL_1G]; + struct page *page, *next; + struct folio *folio; + unsigned long idx; + LIST_HEAD(page_list); + + if (!h) + return -EINVAL; + + spin_lock(&hugetlb_lock); + if ((h->free_huge_pages_node[nid] < nr_pages) || + (h->free_huge_pages - h->resv_huge_pages < nr_pages)) { + spin_unlock(&hugetlb_lock); + return -ENOMEM; + } + + while (count < nr_pages) { + folio = dequeue_hugetlb_folio_node_exact(h, nid); + if (!folio) + break; + page = folio_page(folio, 0); + /* dequeue will unfreeze the page, refreeze it. */ + page_ref_freeze(page, 1); + idx = hugepage_index(page_to_pfn(page)); + if (dpool_record_page(dpool, idx)) { + enqueue_hugetlb_folio(h, folio); + pr_err("dpool_page_array can't record page 0x%px\n", + page); + continue; + } + list_move(&page->lru, &page_list); + count++; + } + spin_unlock(&hugetlb_lock); + + list_for_each_entry_safe(page, next, &page_list, lru) { + if (hugetlb_vmemmap_restore(h, page)) { + spin_lock(&hugetlb_lock); + enqueue_hugetlb_folio(h, folio); + spin_unlock(&hugetlb_lock); + pr_err("restore hugetlb_vmemmap failed page 0x%px\n", + page); + continue; + } + + __SetPageDpool(page); + spin_lock(&dpool->lock); + list_move(&page->lru, &pool->freelist); + pool->free_pages++; + dpool->total_pages++; + spin_unlock(&dpool->lock); + } + + return 0; +} + +static int dpool_drain_to_hugetlb(struct dynamic_pool *dpool) +{ + struct hstate *h = size_to_hstate(PUD_SIZE); + struct pages_pool *pool = &dpool->pool[PAGES_POOL_1G]; + struct page *page, *next; + unsigned long idx; + LIST_HEAD(page_list); + + if (!h) + return -EINVAL; + + spin_lock(&dpool->lock); + list_for_each_entry_safe(page, next, &pool->freelist, lru) { + WARN_ON(PageHWPoison(page)); + idx = hugepage_index(page_to_pfn(page)); + WARN_ON(dpool_record_page(NULL, idx)); + + list_move(&page->lru, &page_list); + __ClearPageDpool(page); + pool->free_pages--; + dpool->total_pages--; + } + spin_unlock(&dpool->lock); + + list_for_each_entry_safe(page, next, &page_list, lru) { + hugetlb_vmemmap_optimize(h, page); + spin_lock(&hugetlb_lock); + enqueue_hugetlb_folio(h, page_folio(page)); + spin_unlock(&hugetlb_lock); + } + + return dpool->total_pages ? 
-ENOMEM : 0; +} + +static int dpool_merge_all(struct dynamic_pool *dpool) +{ + struct pages_pool *pool; + int ret = -ENOMEM; + + pool = &dpool->pool[PAGES_POOL_2M]; + while (pool->split_pages) { + cond_resched(); + ret = dpool_promote_pool(dpool, PAGES_POOL_2M); + if (ret) { + pr_err("some 4K pages can't merge ret: %d, delete failed: \n", + ret); + pr_cont_cgroup_name(dpool->memcg->css.cgroup); + pr_cont("\n"); + goto out; + } + } + + spin_lock(&dpool->lock); + if (pool->split_pages || pool->used_huge_pages || pool->resv_huge_pages) { + ret = -ENOMEM; + pr_err("some 2M pages are still in use or mmap, delete failed: "); + pr_cont_cgroup_name(dpool->memcg->css.cgroup); + pr_cont("\n"); + spin_unlock(&dpool->lock); + goto out; + } + + pool->free_pages += pool->nr_huge_pages; + pool->nr_huge_pages = 0; + pool->free_huge_pages = 0; + spin_unlock(&dpool->lock); + + pool = &dpool->pool[PAGES_POOL_1G]; + while (pool->split_pages) { + cond_resched(); + ret = dpool_promote_pool(dpool, PAGES_POOL_1G); + if (ret) { + pr_err("some 2M pages can't merge ret: %d, delete failed: \n", + ret); + pr_cont_cgroup_name(dpool->memcg->css.cgroup); + pr_cont("\n"); + goto out; + } + } + + spin_lock(&dpool->lock); + if (pool->split_pages || pool->used_huge_pages || pool->resv_huge_pages) { + ret = -ENOMEM; + pr_err("some 1G pages are still in use or mmap, delete failed: "); + pr_cont_cgroup_name(dpool->memcg->css.cgroup); + pr_cont("\n"); + spin_unlock(&dpool->lock); + goto out; + } + + pool->free_pages += pool->nr_huge_pages; + pool->nr_huge_pages = 0; + pool->free_huge_pages = 0; + spin_unlock(&dpool->lock); + ret = 0; + +out: + return ret; +} + +static struct dynamic_pool_ops hugetlb_dpool_ops = { + .fill_pool = dpool_fill_from_hugetlb, + .drain_pool = dpool_drain_to_hugetlb, + .restore_pool = dpool_merge_all, +}; + +/* If dynamic pool is disabled, hide the interface */ +bool dynamic_pool_hide_files(struct cftype *cft) +{ + if (dpool_enabled && enable_dhugetlb) + return false; + + return !!strstr(cft->name, "dhugetlb"); +} + +int dynamic_pool_add_memory(struct mem_cgroup *memcg, int nid, + unsigned long size) +{ + struct dynamic_pool *dpool; + int ret = -EINVAL; + bool new_create = false; + + if (!dpool_enabled) + return -EINVAL; + + mutex_lock(&dpool_mutex); + + if (!(memcg->css.cgroup->self.flags & CSS_ONLINE)) { + pr_err("add memory failed, memcg is going offline\n"); + goto unlock; + } + + dpool = memcg->dpool; + if (!dpool) { + dpool = dpool_create(memcg, &hugetlb_dpool_ops); + if (!dpool) + goto unlock; + + dpool->nid = nid; + new_create = true; + } else if (dpool->memcg != memcg) { + pr_err("add memory failed, not parent memcg\n"); + goto unlock; + } else if (dpool->nid != nid) { + pr_err("add memory failed, not target nid(%d)\n", + dpool->nid); + goto unlock; + } + + BUG_ON(!dpool->ops->fill_pool); + ret = dpool->ops->fill_pool(dpool, &size); + if (ret) { + pr_err("fill pool failed\n"); + /* + * If create a new hpool here but add memory failed, + * release it directly here. 
+		 */
+		if (new_create) {
+			memcg->dpool = NULL;
+			dpool_put(dpool);
+		}
+	}
+
+unlock:
+	mutex_unlock(&dpool_mutex);
+
+	return ret;
+}
+
+void dynamic_pool_show(struct mem_cgroup *memcg, struct seq_file *m)
+{
+	struct dynamic_pool *dpool;
+	unsigned long free_pages;
+	long used_pages;
+
+	if (!dpool_enabled || !memcg)
+		return;
+
+	dpool = dpool_get_from_memcg(memcg);
+	if (!dpool) {
+		seq_puts(m, "Current hierarchy has no memory pool.\n");
+		return;
+	}
+
+	dpool_disable_pcp_pool(dpool, false);
+	spin_lock(&dpool->lock);
+
+	/*
+	 * Nobody else can modify the counts because the pcp pool is
+	 * disabled and dpool->lock is held.
+	 */
+	dpool_sum_pcp_pool(dpool, &free_pages, &used_pages);
+	free_pages += dpool->pool[PAGES_POOL_4K].free_pages;
+	used_pages += dpool->pool[PAGES_POOL_4K].used_pages;
+
+	seq_printf(m, "nid %d\n", dpool->nid);
+	seq_printf(m, "dhugetlb_total_pages %lu\n", dpool->total_pages);
+	seq_printf(m, "1G_total_reserved_pages %lu\n",
+		   dpool->pool[PAGES_POOL_1G].nr_huge_pages);
+	seq_printf(m, "1G_free_reserved_pages %lu\n",
+		   dpool->pool[PAGES_POOL_1G].free_huge_pages);
+	seq_printf(m, "1G_mmap_reserved_pages %lu\n",
+		   dpool->pool[PAGES_POOL_1G].resv_huge_pages);
+	seq_printf(m, "1G_used_pages %lu\n",
+		   dpool->pool[PAGES_POOL_1G].used_huge_pages);
+	seq_printf(m, "2M_total_reserved_pages %lu\n",
+		   dpool->pool[PAGES_POOL_2M].nr_huge_pages);
+	seq_printf(m, "2M_free_reserved_pages %lu\n",
+		   dpool->pool[PAGES_POOL_2M].free_huge_pages);
+	seq_printf(m, "2M_mmap_reserved_pages %lu\n",
+		   dpool->pool[PAGES_POOL_2M].resv_huge_pages);
+	seq_printf(m, "2M_used_pages %lu\n",
+		   dpool->pool[PAGES_POOL_2M].used_huge_pages);
+	seq_printf(m, "1G_free_unreserved_pages %lu\n",
+		   dpool->pool[PAGES_POOL_1G].free_pages);
+	seq_printf(m, "2M_free_unreserved_pages %lu\n",
+		   dpool->pool[PAGES_POOL_2M].free_pages);
+	seq_printf(m, "4K_free_pages %lu\n", free_pages);
+	seq_printf(m, "4K_used_pages %ld\n", used_pages);
+
+	spin_unlock(&dpool->lock);
+	dpool_enable_pcp_pool(dpool);
+	dpool_put(dpool);
+}
+
+int dynamic_pool_reserve_hugepage(struct mem_cgroup *memcg,
+				  unsigned long nr_pages, int type)
+{
+	struct dynamic_pool *dpool;
+	struct pages_pool *pool;
+	unsigned long delta;
+	int ret = -EINVAL;
+
+	if (!dpool_enabled)
+		return -EINVAL;
+
+	mutex_lock(&dpool_mutex);
+
+	dpool = dpool_get_from_memcg(memcg);
+	if (!dpool)
+		goto unlock;
+
+	pool = &dpool->pool[type];
+	spin_lock(&dpool->lock);
+	if (nr_pages > pool->nr_huge_pages) {
+		delta = nr_pages - pool->nr_huge_pages;
+		while (delta > pool->free_pages &&
+		       !dpool_demote_pool_locked(dpool, type - 1))
+			cond_resched_lock(&dpool->lock);
+		/* Only try to merge pages for the 2M pool */
+		if (type == PAGES_POOL_2M) {
+			while (delta > pool->free_pages) {
+				spin_unlock(&dpool->lock);
+				cond_resched();
+				if (dpool_promote_pool(dpool, type)) {
+					spin_lock(&dpool->lock);
+					break;
+				}
+				spin_lock(&dpool->lock);
+			}
+		}
+		delta = min(delta, pool->free_pages);
+		pool->nr_huge_pages += delta;
+		pool->free_huge_pages += delta;
+		pool->free_pages -= delta;
+	} else {
+		delta = min(pool->nr_huge_pages - nr_pages,
+			    pool->free_huge_pages - pool->resv_huge_pages);
+		pool->nr_huge_pages -= delta;
+		pool->free_huge_pages -= delta;
+		pool->free_pages += delta;
+	}
+	spin_unlock(&dpool->lock);
+	dpool_put(dpool);
+	ret = 0;
+
+unlock:
+	mutex_unlock(&dpool_mutex);
+
+	return ret;
+}
+
+/* === Dynamic pagelist interface ===================================== */
+
+static int __init dynamic_pagelist_setup(char *buf)
+{
+	if (enable_dhugetlb)
+		return 0;
+
+	return
kstrtobool(buf, &enable_dpagelist); +} +early_param("dpool", dynamic_pagelist_setup); + +static int dpool_fill_from_pagelist(struct dynamic_pool *dpool, void *arg) +{ + struct dpool_info *info = (struct dpool_info *)arg; + struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K]; + int i, ret = -EINVAL; + + dpool->range_cnt = info->range_cnt; + dpool->pfn_ranges = + kmalloc_array(info->range_cnt, sizeof(struct range), GFP_KERNEL); + if (!dpool->pfn_ranges) + return -ENOMEM; + + memcpy(dpool->pfn_ranges, info->pfn_ranges, + sizeof(struct range) * dpool->range_cnt); + + spin_lock(&dpool->lock); + + for (i = 0; i < dpool->range_cnt; i++) { + struct range *range = &dpool->pfn_ranges[i]; + u64 pfn; + + for (pfn = range->start; pfn <= range->end; pfn++) { + struct page *page = pfn_to_page(pfn); + + set_page_count(page, 0); + page_mapcount_reset(page); + + if (!free_pages_prepare(page, 0, 0)) { + pr_err("fill pool failed, check pages failed\n"); + goto unlock; + } + + __SetPageDpool(page); + list_add_tail(&page->lru, &pool->freelist); + pool->free_pages++; + + cond_resched_lock(&dpool->lock); + } + } + ret = 0; + +unlock: + spin_unlock(&dpool->lock); + + return ret; +} + +static int dpool_drain_to_pagelist(struct dynamic_pool *dpool) +{ + struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K]; + + /* check poisoned pages */ + return (pool->used_pages == dpool->nr_poisoned_pages) ? 0 : -ENOMEM; +} + +static int dpool_migrate_used_pages(struct dynamic_pool *dpool) +{ + int range_cnt = dpool->range_cnt; + int i; + + spin_lock(&dpool->lock); + + dpool->nr_poisoned_pages = 0; + for (i = 0; i < range_cnt; i++) { + struct range *range = &dpool->pfn_ranges[i]; + u64 pfn; + + for (pfn = range->start; pfn <= range->end; pfn++) { + struct page *page = pfn_to_page(pfn); + + /* Unlock and try migration. */ + spin_unlock(&dpool->lock); + cond_resched(); + + if (PageDpool(page)) { + spin_lock(&dpool->lock); + continue; + } + + if (PageHWPoison(page)) + dpool->nr_poisoned_pages++; + + lru_add_drain_all(); + do_migrate_range(pfn, pfn + 1); + spin_lock(&dpool->lock); + } + } + + spin_unlock(&dpool->lock); + + return 0; +} + +struct dynamic_pool_ops pagelist_dpool_ops = { + .fill_pool = dpool_fill_from_pagelist, + .drain_pool = dpool_drain_to_pagelist, + .restore_pool = dpool_migrate_used_pages, +}; + +int dpool_init(struct dpool_info *arg) +{ + struct dynamic_pool *dpool; + int ret; + + if (!dpool_enabled) + return -EINVAL; + + if (!arg || !arg->memcg || arg->range_cnt <= 0) { + pr_err("init failed, arg is invalid\n"); + return -EINVAL; + } + + mutex_lock(&dpool_mutex); + + if (dpool_global_pool || arg->memcg->dpool) { + pr_err("init failed, dpool is already exist\n"); + ret = -EINVAL; + goto unlock; + } + + if (!(arg->memcg->css.cgroup->self.flags & CSS_ONLINE)) { + pr_err("init failed, memcg is not online\n"); + ret = -EINVAL; + goto unlock; + } + + dpool = dpool_create(arg->memcg, &pagelist_dpool_ops); + if (!dpool) { + pr_err("init failed, create failed. 
ret: %d\n", ret); + ret = -ENOMEM; + goto unlock; + } + + dpool_global_pool = dpool; + + BUG_ON(!dpool->ops->fill_pool); + ret = dpool->ops->fill_pool(dpool, arg); + if (ret) + dpool_put(dpool); + +unlock: + mutex_unlock(&dpool_mutex); + + return ret; +} + +void dynamic_pool_show_meminfo(struct seq_file *m) +{ + struct dynamic_pool *dpool; + struct pages_pool *pool; + unsigned long free_pages = 0; + long used_pages = 0; + + if (!dpool_enabled || !enable_dpagelist) + return; + + dpool = dpool_get_from_page(NULL); + if (!dpool) + goto out; + + pool = &dpool->pool[PAGES_POOL_4K]; + dpool_disable_pcp_pool(dpool, false); + spin_lock(&dpool->lock); + dpool_sum_pcp_pool(dpool, &free_pages, &used_pages); + free_pages += pool->free_pages; + used_pages += pool->used_pages; + spin_unlock(&dpool->lock); + dpool_enable_pcp_pool(dpool); + +out: + if (m) { + seq_printf(m, + "DPoolTotal: %8lu kB\n" + "DPoolFree: %8ld kB\n", + (free_pages + used_pages) << (PAGE_SHIFT - 10), + free_pages << (PAGE_SHIFT - 10)); + } else { + pr_info("DPoolTotal: %lu kB\n", + (free_pages + used_pages) << (PAGE_SHIFT - 10)); + pr_info("DPoolFree: %ld kB\n", free_pages << (PAGE_SHIFT - 10)); + } + + dpool_put(dpool); +} diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 825d1a18d27db0d8631f84b380d50353ac408357..ec33252edb8df33c33e3292881d822d9f5307fa1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -92,7 +93,8 @@ static int num_fault_mutexes; struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ -static int hugetlb_acct_memory(struct hstate *h, long delta); +static int hugetlb_acct_memory(struct hstate *h, long delta, + struct hugetlbfs_inode_info *info); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); @@ -123,7 +125,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, if (subpool_is_free(spool)) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, - -spool->min_hpages); + -spool->min_hpages, NULL); kfree(spool); } } @@ -143,7 +145,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, spool->hstate = h; spool->min_hpages = min_hpages; - if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages, NULL)) { kfree(spool); return NULL; } @@ -171,13 +173,16 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, - long delta) + long delta, struct hugetlbfs_inode_info *info) { long ret = delta; if (!spool) return ret; + if (file_in_dynamic_pool(info)) + return ret; + spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ @@ -216,7 +221,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, * in the case where a subpool minimum size must be maintained. 
*/ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, - long delta) + long delta, struct hugetlbfs_inode_info *info) { long ret = delta; unsigned long flags; @@ -224,6 +229,9 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, if (!spool) return delta; + if (file_in_dynamic_pool(info)) + return ret; + spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ @@ -935,14 +943,15 @@ static long region_del(struct resv_map *resv, long f, long t) void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); long rsv_adjust; bool reserved = false; - rsv_adjust = hugepage_subpool_get_pages(spool, 1); + rsv_adjust = hugepage_subpool_get_pages(spool, 1, info); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); - if (!hugetlb_acct_memory(h, 1)) + if (!hugetlb_acct_memory(h, 1, info)) reserved = true; } else if (!rsv_adjust) { reserved = true; @@ -1315,7 +1324,7 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) return false; } -static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) +void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); @@ -1328,8 +1337,7 @@ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) folio_set_hugetlb_freed(folio); } -static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, - int nid) +struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid) { struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); @@ -1539,7 +1547,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, __folio_clear_head(folio); } -static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, +void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, unsigned int order) { __destroy_compound_gigantic_folio(folio, order, true); @@ -1929,7 +1937,7 @@ void free_huge_folio(struct folio *folio) * after page is free. Therefore, force restore_reserve * operation. 
*/ - if (hugepage_subpool_put_pages(spool, 1) == 0) + if (hugepage_subpool_put_pages(spool, 1, NULL) == 0) restore_reserve = true; } @@ -1939,6 +1947,14 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + + if (page_from_dynamic_pool(folio_page(folio, 0))) { + list_del(&folio->lru); + spin_unlock_irqrestore(&hugetlb_lock, flags); + dynamic_pool_free_hugepage(folio, restore_reserve); + return; + } + if (restore_reserve) h->resv_huge_pages++; @@ -1968,7 +1984,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); @@ -2069,7 +2085,7 @@ static bool prep_compound_gigantic_folio(struct folio *folio, return __prep_compound_gigantic_folio(folio, order, false); } -static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, +bool prep_compound_gigantic_folio_for_demote(struct folio *folio, unsigned int order) { return __prep_compound_gigantic_folio(folio, order, true); @@ -2383,6 +2399,9 @@ int dissolve_free_huge_page(struct page *page) if (!folio_test_hugetlb(folio)) return 0; + if (page_from_dynamic_pool(page) || page_in_dynamic_pool(page)) + return -EBUSY; + spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio)) { rc = 0; @@ -3085,6 +3104,9 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) struct folio *folio = page_folio(page); int ret = -EBUSY; + if (page_from_dynamic_pool(page) || page_in_dynamic_pool(page)) + return -EBUSY; + /* * The page might have been dissolved from under our feet, so make sure * to carefully check the state under the lock. @@ -3119,6 +3141,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(file_inode(vma->vm_file)); struct hstate *h = hstate_vma(vma); struct folio *folio; long map_chg, map_commit; @@ -3145,7 +3168,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * checked against any subpool limit. */ if (map_chg || avoid_reserve) { - gbl_chg = hugepage_subpool_get_pages(spool, 1); + gbl_chg = hugepage_subpool_get_pages(spool, 1, info); if (gbl_chg < 0) { vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); @@ -3177,6 +3200,19 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation; + if (file_in_dynamic_pool(info)) { + bool reserved = false; + + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) + reserved = true; + folio = dynamic_pool_alloc_hugepage(info, h, reserved); + if (!folio) + goto out_uncharge_cgroup; + spin_lock_irq(&hugetlb_lock); + list_add(&folio->lru, &h->hugepage_activelist); + goto out; + } + spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken @@ -3199,6 +3235,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, /* Fall through */ } +out: hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. 
@@ -3225,8 +3262,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ long rsv_adjust; - rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); + rsv_adjust = hugepage_subpool_put_pages(spool, 1, info); + hugetlb_acct_memory(h, -rsv_adjust, info); if (deferred_reserve) hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); @@ -3241,7 +3278,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, h_cg); out_subpool_put: if (map_chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); + hugepage_subpool_put_pages(spool, 1, info); vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -4928,13 +4965,17 @@ unsigned long hugetlb_total_pages(void) return nr_total_pages; } -static int hugetlb_acct_memory(struct hstate *h, long delta) +static int hugetlb_acct_memory(struct hstate *h, long delta, + struct hugetlbfs_inode_info *info) { int ret = -ENOMEM; if (!delta) return 0; + if (file_in_dynamic_pool(info)) + return dynamic_pool_hugetlb_acct_memory(h, delta, info); + spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page @@ -5021,6 +5062,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) struct hstate *h = hstate_vma(vma); struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); + struct hugetlbfs_inode_info *info; unsigned long reserve, start, end; long gbl_reserve; @@ -5040,8 +5082,9 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ - gbl_reserve = hugepage_subpool_put_pages(spool, reserve); - hugetlb_acct_memory(h, -gbl_reserve); + info = HUGETLBFS_I(file_inode(vma->vm_file)); + gbl_reserve = hugepage_subpool_put_pages(spool, reserve, info); + hugetlb_acct_memory(h, -gbl_reserve, info); } kref_put(&resv->refs, resv_map_release); @@ -6865,6 +6908,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long chg = -1, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct resv_map *resv_map; struct hugetlb_cgroup *h_cg = NULL; long gbl_reserve, regions_needed = 0; @@ -6935,7 +6979,7 @@ bool hugetlb_reserve_pages(struct inode *inode, * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ - gbl_reserve = hugepage_subpool_get_pages(spool, chg); + gbl_reserve = hugepage_subpool_get_pages(spool, chg, info); if (gbl_reserve < 0) goto out_uncharge_cgroup; @@ -6943,7 +6987,7 @@ bool hugetlb_reserve_pages(struct inode *inode, * Check enough hugepages are available for the reservation. 
* Hand the pages back to the subpool if there are not */ - if (hugetlb_acct_memory(h, gbl_reserve) < 0) + if (hugetlb_acct_memory(h, gbl_reserve, info) < 0) goto out_put_pages; /* @@ -6961,7 +7005,7 @@ bool hugetlb_reserve_pages(struct inode *inode, add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { - hugetlb_acct_memory(h, -gbl_reserve); + hugetlb_acct_memory(h, -gbl_reserve, info); goto out_put_pages; } else if (unlikely(chg > add)) { /* @@ -6982,8 +7026,8 @@ bool hugetlb_reserve_pages(struct inode *inode, (chg - add) * pages_per_huge_page(h), h_cg); rsv_adjust = hugepage_subpool_put_pages(spool, - chg - add); - hugetlb_acct_memory(h, -rsv_adjust); + chg - add, info); + hugetlb_acct_memory(h, -rsv_adjust, info); } else if (h_cg) { /* * The file_regions will hold their own reference to @@ -6998,7 +7042,7 @@ bool hugetlb_reserve_pages(struct inode *inode, out_put_pages: /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + (void)hugepage_subpool_put_pages(spool, chg, info); out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); @@ -7024,6 +7068,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); long gbl_reserve; /* @@ -7052,8 +7097,8 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, * Note that !resv_map implies freed == 0. So (chg - freed) * won't go negative. */ - gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); - hugetlb_acct_memory(h, -gbl_reserve); + gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed), info); + hugetlb_acct_memory(h, -gbl_reserve, info); return 0; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index b5a8587a0bad810aa0d555eb89cb4b30d85545f7..90aee2cd3dab302a21d400ba13425c1fbc3edbca 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include "hugetlb_vmemmap.h" @@ -488,6 +489,9 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h if (!hugetlb_vmemmap_optimizable(h)) return false; + if (page_in_dynamic_pool((struct page *)head)) + return false; + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { pmd_t *pmdp, pmd; struct page *vmemmap_page; diff --git a/mm/internal.h b/mm/internal.h index 1ebba69437d6f791994c7c447f1e176f0d7441b0..f4416fcbae782bfe1b256c07b523326e87ff3275 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -434,10 +434,16 @@ static inline void prep_compound_tail(struct page *head, int tail_idx) set_page_private(p, 0); } +typedef int __bitwise fpi_t; extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); +extern bool free_pages_prepare(struct page *page, unsigned int order, + fpi_t fpi_flags); +extern int check_new_page(struct page *page); +extern void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags); extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6f2787d3b682072aa74cd712d4f31240c76485b3..50babcbf11cea7b40484ffbce74a21f48386dd1c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -20,6 +20,7 @@ #include 
#include #include +#include #include #include @@ -361,6 +362,10 @@ int hugepage_madvise(struct vm_area_struct *vma, if (mm_has_pgste(vma->vm_mm)) return 0; #endif + + if (task_in_dynamic_pool(current)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* @@ -1369,6 +1374,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, goto out_unmap; } + if (page_from_dynamic_pool(page)) { + result = SCAN_FAIL; + goto out_unmap; + } + /* * Check if the page has any GUP (or other external) pins. * @@ -2296,6 +2306,11 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, break; } + if (page_from_dynamic_pool(page)) { + result = SCAN_FAIL; + break; + } + /* * We probably should check if the page is referenced here, but * nobody would transfer pte_young() to PageReferenced() for us. @@ -2726,6 +2741,9 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return -EINVAL; + if (task_in_dynamic_pool(current)) + return -EINVAL; + cc = kmalloc(sizeof(*cc), GFP_KERNEL); if (!cc) return -ENOMEM; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b037c313f4a4650516840bc5d8d54aa29d496838..717b88092f769728299ae482d86b0239bf4835df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,6 +66,7 @@ #include #include #include +#include #ifdef CONFIG_MEMCG_SWAP_QOS #include @@ -1310,6 +1311,24 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, } } +/** + * mem_cgroup_scan_cgroups - iterate over memcgs of a memory cgroup hierarchy + * @memcg: hierarchy root + * @fn: function to call for each memcg + * @arg: argument passed to @fn + * + * This function iterates over memcg attached to @memcg or to any of its + * descendants and calls @fn for each memcgs. + */ +void mem_cgroup_scan_cgroups(struct mem_cgroup *memcg, + void (*fn)(struct mem_cgroup *, void *), void *arg) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) + fn(iter, arg); +} + #ifdef CONFIG_DEBUG_VM void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { @@ -3710,7 +3729,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * * Caller is responsible for holding css reference for memcg. */ -static int mem_cgroup_force_empty(struct mem_cgroup *memcg) +int mem_cgroup_force_empty(struct mem_cgroup *memcg) { int nr_retries = MAX_RECLAIM_RETRIES; @@ -5730,6 +5749,81 @@ static ssize_t memory_ksm_write(struct kernfs_open_file *of, char *buf, } #endif /* CONFIG_KSM */ +#ifdef CONFIG_DYNAMIC_POOL +static ssize_t mem_cgroup_dpool_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long size; + int nid; + char *endp; + int ret = -EINVAL; + + buf = strstrip(buf); + nid = memparse(buf, &endp); + if (*endp != ' ') + goto out; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)) + goto out; + + buf = endp + 1; + size = memparse(buf, &endp); + if (*endp != '\0' || size == 0) + goto out; + + ret = dynamic_pool_add_memory(memcg, nid, size); + +out: + return ret ? 
: nbytes; +} + +static int mem_cgroup_dpool_read(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + dynamic_pool_show(memcg, m); + + return 0; +} + +static ssize_t mem_cgroup_dpool_1G_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + char *endp; + unsigned long nr_pages; + int ret; + + buf = strstrip(buf); + nr_pages = memparse(buf, &endp); + if (*endp != '\0') + return -EINVAL; + + ret = dynamic_pool_reserve_hugepage(memcg, nr_pages, PAGES_POOL_1G); + + return ret ? : nbytes; +} + +static ssize_t mem_cgroup_dpool_2M_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + char *endp; + unsigned long nr_pages; + int ret; + + buf = strstrip(buf); + nr_pages = memparse(buf, &endp); + if (*endp != '\0') + return -EINVAL; + + ret = dynamic_pool_reserve_hugepage(memcg, nr_pages, PAGES_POOL_2M); + + return ret ? : nbytes; +} +#endif + static int memory_stat_show(struct seq_file *m, void *v); #ifdef CONFIG_MEMCG_V1_RECLAIM @@ -6090,6 +6184,24 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memcg_swapfile_write, .seq_show = memcg_swapfile_read, }, +#endif +#ifdef CONFIG_DYNAMIC_POOL + { + .name = "dhugetlb.nr_pages", + .write = mem_cgroup_dpool_write, + .seq_show = mem_cgroup_dpool_read, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, + }, + { + .name = "dhugetlb.1G.reserved_pages", + .write = mem_cgroup_dpool_1G_write, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, + }, + { + .name = "dhugetlb.2M.reserved_pages", + .write = mem_cgroup_dpool_2M_write, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, + }, #endif { }, /* terminate */ }; @@ -6335,6 +6447,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); memcg_swap_device_init(memcg, parent); + dynamic_pool_inherit(parent, memcg); } else { init_memcg_events(); page_counter_init(&memcg->memory, NULL); @@ -7125,6 +7238,10 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) if (!p) return 0; + ret = dynamic_pool_can_attach(p, memcg); + if (ret) + return ret; + /* * We are now committed to this value whatever it is. Changes in this * tunable will only affect upcoming migrations, not the current one. 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4d6e43c88489a0ef1755656a7512577902374496..0b59cf8c544a8102e3a9b7484d4a997ae9f426b5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "swap.h" #include "internal.h" #include "ras/ras_event.h" @@ -893,6 +894,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", + [MF_MSG_FREE_DPOOL] = "free dynamic pool page", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1376,7 +1378,8 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) return true; - return PageLRU(page) || is_free_buddy_page(page); + return PageLRU(page) || is_free_buddy_page(page) || + page_in_dynamic_pool(page); } static int __get_hwpoison_page(struct page *page, unsigned long flags) @@ -1432,7 +1435,8 @@ static int get_any_page(struct page *p, unsigned long flags) if (pass++ < 3) goto try_again; ret = -EBUSY; - } else if (!PageHuge(p) && !is_free_buddy_page(p)) { + } else if (!PageHuge(p) && !is_free_buddy_page(p) && + !page_in_dynamic_pool(p)) { /* We raced with put_page, retry. */ if (pass++ < 3) goto try_again; @@ -1983,6 +1987,8 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, count_increased = true; } else if (folio_test_hugetlb_freed(folio)) { ret = 0; + } else if (page_in_dynamic_pool(folio_page(folio, 0))) { + ret = 0; } else if (folio_test_hugetlb_migratable(folio)) { ret = folio_try_get(folio); if (ret) @@ -2071,6 +2077,8 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; + } else if (page_in_dynamic_pool(p)) { + return action_result(pfn, MF_MSG_FREE_DPOOL, MF_RECOVERED); } else { res = MF_FAILED; } @@ -2226,6 +2234,8 @@ int memory_failure(unsigned long pfn, int flags) * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. + * 3) it's a free page in dynamic pool, and therefore in safe hand: + * check_new_page() will be the gate keeper. * In fact it's dangerous to directly bump up page count from 0, * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. 
*/ @@ -2246,6 +2256,8 @@ int memory_failure(unsigned long pfn, int flags) res = MF_FAILED; } res = action_result(pfn, MF_MSG_BUDDY, res); + } else if (page_in_dynamic_pool(p)) { + res = action_result(pfn, MF_MSG_FREE_DPOOL, MF_RECOVERED); } else { res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); } diff --git a/mm/memory.c b/mm/memory.c index 944c2ce2756b11b98873a255a801ff5758c1373a..22e0150acec4a712cfd462b442f11d4b5da73d53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -78,6 +78,7 @@ #include #include #include +#include #include @@ -5107,7 +5108,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true) && + !task_in_dynamic_pool(current)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5141,7 +5143,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true) && + !task_in_dynamic_pool(current)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 144758820e3a3c314f0ecd20b3694fc2106928f5..6f949d1b2eb0213b554f51089d3a66a1546d2c89 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1698,7 +1698,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, return 0; } -static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page, *head; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 42e434224e6773d566a03a1558b56bd1c6d1f2b2..da0ac870a3a98896b713df6f56ee3920106cf7f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include "internal.h" #include "shuffle.h" @@ -1073,7 +1074,7 @@ static void kernel_init_pages(struct page *page, int numpages) kasan_enable_current(); } -static __always_inline bool free_pages_prepare(struct page *page, +__always_inline bool free_pages_prepare(struct page *page, unsigned int order, fpi_t fpi_flags) { int bad = 0; @@ -1426,7 +1427,7 @@ static void check_new_page_bad(struct page *page) /* * This page is about to be returned from the page allocator */ -static int check_new_page(struct page *page) +int check_new_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) @@ -1538,8 +1539,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, page_table_check_alloc(page, order); } -static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - unsigned int alloc_flags) +void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags) { post_alloc_hook(page, order, gfp_flags); @@ -2484,6 +2485,11 @@ void free_unref_page(struct page *page, unsigned int order) unsigned long pfn = page_to_pfn(page); int migratetype, pcpmigratetype; + if (page_from_dynamic_pool(page)) { + dynamic_pool_free_page(page); + return; + } + if (!free_unref_page_prepare(page, pfn, order)) return; @@ -2530,6 +2536,13 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { unsigned long pfn = page_to_pfn(page); + + if 
(page_from_dynamic_pool(page)) { + list_del(&page->lru); + dynamic_pool_free_page(page); + continue; + } + if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); continue; @@ -4734,6 +4747,13 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, */ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); + /* Before alloc from buddy system, alloc from dpool firstly */ + if (dpool_enabled) { + page = dynamic_pool_alloc_page(alloc_gfp, order, alloc_flags); + if (page) + goto out; + } + /* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); if (likely(page)) diff --git a/mm/shmem.c b/mm/shmem.c index b44bfad90f8de64d8939652846fa532d382db45f..de9a884b10e8fd7e224e606c7c303a550aedc4e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -79,6 +79,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include @@ -2019,6 +2020,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (!shmem_is_huge(inode, index, false, vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) goto alloc_nohuge; + if (task_in_dynamic_pool(current)) + goto alloc_nohuge; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); diff --git a/mm/show_mem.c b/mm/show_mem.c index 5604925fb0b42515183efe11e33a09857aaa7b18..1aa5cf26221f86cda02bdd266a43059e721a8504 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #include "swap.h" @@ -427,4 +428,5 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif reliable_report_meminfo(NULL); + dynamic_pool_show_meminfo(NULL); } diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 8d5595b6c59f848fa560becf070f282815e1dcd5..e48d85fc7951514d10942387a2bd4708484b3aa9 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -126,6 +126,7 @@ static const char * const page_flag_names[] = { [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", + [KPF_POOL] = "p:pool", [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked",
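Taken together, the new memcg files give a simple user-space flow: write "<nid> <count>" to dhugetlb.nr_pages (parsed by mem_cgroup_dpool_write() and handed to dpool_fill_from_hugetlb() as a count of 1G hugepages), write a number to dhugetlb.1G.reserved_pages or dhugetlb.2M.reserved_pages to carve out huge pages, and read dhugetlb.nr_pages back to see the counters printed by dynamic_pool_show(). The program below is only a minimal sketch of that flow; the cgroup v1 mount point, the cgroup name "dpool_test" and the concrete counts are assumptions for illustration, not values taken from this patch.

/*
 * Illustrative user-space sketch: configure a dynamic pool through the
 * dhugetlb.* memcg files and dump its statistics. Paths, cgroup name
 * and counts are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

#define CG "/sys/fs/cgroup/memory/dpool_test/"

/* Write a string to a cgroup control file; 0 on success, -1 on error. */
static int cg_write(const char *file, const char *val)
{
	char path[256];
	FILE *f;
	int ret = 0;

	snprintf(path, sizeof(path), CG "%s", file);
	f = fopen(path, "w");
	if (!f)
		return -1;
	if (fputs(val, f) < 0)
		ret = -1;
	if (fclose(f) != 0)
		ret = -1;
	return ret;
}

int main(void)
{
	char line[128];
	FILE *f;

	/* "<nid> <count>": move four 1G hugepages from node 0 into the pool. */
	if (cg_write("dhugetlb.nr_pages", "0 4"))
		return EXIT_FAILURE;

	/* Keep two pages as 1G hugepages and sixteen as 2M hugepages. */
	if (cg_write("dhugetlb.1G.reserved_pages", "2") ||
	    cg_write("dhugetlb.2M.reserved_pages", "16"))
		return EXIT_FAILURE;

	/* Read back the per-pool counters printed by dynamic_pool_show(). */
	f = fopen(CG "dhugetlb.nr_pages", "r");
	if (!f)
		return EXIT_FAILURE;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return EXIT_SUCCESS;
}

From user space a successful write returns the full byte count and a failed one a negative errno, matching the "return ret ? : nbytes;" convention used by the write handlers above.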