diff --git a/arch/arm64/configs/anolis-debug_defconfig b/arch/arm64/configs/anolis-debug_defconfig index 2d548d6f3794f5570335a0e583a3d84cdfb427b4..52a91a051c50272f03574440038fc6cbfb98ce61 100644 --- a/arch/arm64/configs/anolis-debug_defconfig +++ b/arch/arm64/configs/anolis-debug_defconfig @@ -1051,6 +1051,7 @@ CONFIG_DAMON_DBGFS=y # end of Data Access Monitoring CONFIG_FAST_COPY_MM=y +# CONFIG_PAGE_PREZERO is not set # end of Memory Management options CONFIG_NET=y diff --git a/arch/arm64/configs/anolis_defconfig b/arch/arm64/configs/anolis_defconfig index 7474f18539995aa1102416fb3afd5f02abcf11ec..0e31f05062f8ac6c53893ebcdfe770d6ba018c02 100644 --- a/arch/arm64/configs/anolis_defconfig +++ b/arch/arm64/configs/anolis_defconfig @@ -1070,6 +1070,7 @@ CONFIG_DAMON_DBGFS=y # end of Data Access Monitoring CONFIG_FAST_COPY_MM=y +# CONFIG_PAGE_PREZERO is not set # end of Memory Management options CONFIG_NET=y diff --git a/arch/x86/configs/anolis-debug_defconfig b/arch/x86/configs/anolis-debug_defconfig index 2b30caf97b5b806d84d1bff7b382ed697fc90554..d20981d39318665d0a60b3722e20d613a53d2f46 100644 --- a/arch/x86/configs/anolis-debug_defconfig +++ b/arch/x86/configs/anolis-debug_defconfig @@ -1066,6 +1066,7 @@ CONFIG_DAMON_DBGFS=y # end of Data Access Monitoring CONFIG_FAST_COPY_MM=y +CONFIG_PAGE_PREZERO=y # end of Memory Management options CONFIG_NET=y diff --git a/arch/x86/configs/anolis_defconfig b/arch/x86/configs/anolis_defconfig index 55fd36474288260db2ec6f26af2b27cbac42e239..041718400cb236063b22e11cde789ab67f25a577 100644 --- a/arch/x86/configs/anolis_defconfig +++ b/arch/x86/configs/anolis_defconfig @@ -1065,6 +1065,7 @@ CONFIG_DAMON_DBGFS=y # end of Data Access Monitoring CONFIG_FAST_COPY_MM=y +CONFIG_PAGE_PREZERO=y # end of Memory Management options CONFIG_NET=y diff --git a/drivers/base/node.c b/drivers/base/node.c index 7f35dc2cea5139359fff13103ca732057321325d..47a022f945addc2552bcd56450b90912f663bb17 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -439,6 +439,9 @@ static ssize_t node_read_meminfo(struct device *dev, #endif #ifdef CONFIG_DUPTEXT "Node %d DupText: %8lu kB\n" +#endif +#ifdef CONFIG_PAGE_PREZERO + "Node %d MemZeroed: %8lu kB\n" #endif , nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), @@ -476,6 +479,10 @@ static ssize_t node_read_meminfo(struct device *dev, #ifdef CONFIG_DUPTEXT , nid, K(node_page_state(pgdat, NR_DUPTEXT)) +#endif +#ifdef CONFIG_PAGE_PREZERO + , + nid, K(sum_zone_node_page_state(nid, NR_ZEROED_PAGES)) #endif ); len += hugetlb_report_node_meminfo(buf, len, nid); diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 04511a5be03e7afdd8ce503543d2e5aba6fbcaa6..25e3cca9ce8542ceffda8a753c5cfe7f0997f454 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -73,13 +73,25 @@ void idxd_dma_complete_txd(struct idxd_desc *desc, idxd_free_desc(desc->wq, desc); } -static void op_flag_setup(unsigned long flags, u32 *desc_flags) +static inline void op_control_flag_setup(unsigned long flags, u32 *desc_flags) { *desc_flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; if (flags & DMA_PREP_INTERRUPT) *desc_flags |= IDXD_OP_FLAG_RCI; } +static inline void op_mem_flag_setup(unsigned long flags, u32 *desc_flags) +{ + if (!(flags & DMA_PREP_NONTEMPORAL)) + *desc_flags |= IDXD_OP_FLAG_CC; +} + +static inline void op_flag_setup(unsigned long flags, u32 *desc_flags) +{ + op_control_flag_setup(flags, desc_flags); + op_mem_flag_setup(flags, desc_flags); +} + static inline void set_completion_address(struct idxd_desc *desc, u64 *compl_addr) { @@ -272,6 
+284,47 @@ idxd_dma_prep_memcpy_sg(struct dma_chan *chan, return &desc->txd; } +static struct dma_async_tx_descriptor * +idxd_dma_prep_memset(struct dma_chan *c, dma_addr_t dma_dest, int value, + size_t len, unsigned long flags) +{ + struct idxd_wq *wq = to_idxd_wq(c); + u32 desc_flags; + struct idxd_desc *desc; + u64 pattern = 0; + + if (wq->state != IDXD_WQ_ENABLED) + return NULL; + + if (len > wq->max_xfer_bytes) + return NULL; + + op_flag_setup(flags, &desc_flags); + desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(desc)) + return NULL; + + /* + * The dmaengine API provides an int 'value', but it is really an 8bit + * pattern. DSA supports a 64bit pattern, and therefore the 8bit pattern + * will be replicated to 64bits. + */ + if (value) { + pattern = value & 0xff; + pattern |= pattern << 8; + pattern |= pattern << 16; + pattern |= pattern << 32; + } + + idxd_prep_desc_common(wq, desc->hw, DSA_OPCODE_MEMFILL, + pattern, dma_dest, len, desc->compl_dma, + desc_flags); + + desc->txd.flags = flags; + + return &desc->txd; +} + static int idxd_dma_alloc_chan_resources(struct dma_chan *chan) { struct idxd_wq *wq = to_idxd_wq(chan); @@ -435,6 +488,11 @@ int idxd_register_dma_device(struct idxd_device *idxd) dma->device_prep_dma_memcpy_sg = idxd_dma_prep_memcpy_sg; } + if (idxd->hw.opcap.bits[0] & IDXD_OPCAP_MEMFILL) { + dma_cap_set(DMA_MEMSET, dma->cap_mask); + dma->device_prep_dma_memset = idxd_dma_prep_memset; + } + dma->device_tx_status = idxd_dma_tx_status; dma->device_issue_pending = idxd_dma_issue_pending; dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources; diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 982cf9915341b7e37d0b1dc038495b5ef27db800..acee6cadd80e915f0fa68febe8fb415aa172f498 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -92,6 +92,7 @@ union engine_cap_reg { #define IDXD_OPCAP_NOOP 0x0001 #define IDXD_OPCAP_BATCH 0x0002 #define IDXD_OPCAP_MEMMOVE 0x0008 +#define IDXD_OPCAP_MEMFILL BIT(DSA_OPCODE_MEMFILL) struct opcap { u64 bits[4]; }; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index ade24eb04950fe75829912633ab2ca156099af87..5e1a6be82aeb61e3da0108e43b3cfa3ffa974b13 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -173,6 +173,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "DupText: ", global_node_page_state(NR_DUPTEXT)); #endif +#ifdef CONFIG_PAGE_PREZERO + show_val_kb(m, "MemZeroed: ", + global_zone_page_state(NR_ZEROED_PAGES)); +#endif hugetlb_report_meminfo(m); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 2f400f83040c2ef90d2cf31e163a3105b6d52751..5bd995a479dc7b1bab31525e66045f09c827df8a 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -189,6 +189,8 @@ struct dma_interleaved_template { * transaction is marked with DMA_PREP_REPEAT will cause the new transaction * to never be processed and stay in the issued queue forever. The flag is * ignored if the previous transaction is not a repeated transaction. + * @DMA_PREP_NONTEMPORAL - tell the driver that the transaction shall + * direct data writes to memory instead of CPU cache if hardware supports. 
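As a quick sanity check of the pattern replication done in idxd_dma_prep_memset() above, here is a standalone userspace sketch (not part of the patch; memfill_pattern() is an illustrative name) showing how the 8-bit memset value expands into the 64-bit fill pattern consumed by the MEMFILL descriptor:

#include <stdint.h>
#include <stdio.h>

/* Replicate an 8-bit memset value into a 64-bit fill pattern. */
static uint64_t memfill_pattern(int value)
{
	uint64_t pattern = 0;

	if (value) {
		pattern = value & 0xff;
		pattern |= pattern << 8;	/* 16 bits */
		pattern |= pattern << 16;	/* 32 bits */
		pattern |= pattern << 32;	/* 64 bits */
	}
	return pattern;
}

int main(void)
{
	/* 0xab becomes 0xabababababababab; 0 stays 0. */
	printf("%#llx\n", (unsigned long long)memfill_pattern(0xab));
	printf("%#llx\n", (unsigned long long)memfill_pattern(0));
	return 0;
}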
*/ enum dma_ctrl_flags { DMA_PREP_INTERRUPT = (1 << 0), @@ -201,6 +203,7 @@ enum dma_ctrl_flags { DMA_PREP_CMD = (1 << 7), DMA_PREP_REPEAT = (1 << 8), DMA_PREP_LOAD_EOT = (1 << 9), + DMA_PREP_NONTEMPORAL = (1 << 10), }; /** diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 23cfe3287d07f9affbefcd5ab7cb189fe57e4bbd..7e7e46f2136a73bc79ad88061499c9b6d9832b37 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -98,6 +98,9 @@ extern int page_group_by_mobility_disabled; struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; +#ifdef CONFIG_PAGE_PREZERO + unsigned long nr_zeroed; /* Pre-zeroed pages */ +#endif }; static inline struct page *get_page_from_free_area(struct free_area *area, @@ -161,6 +164,9 @@ enum zone_stat_item { NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, +#ifdef CONFIG_PAGE_PREZERO + NR_ZEROED_PAGES, /* Pre-zeroed pages */ +#endif NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { @@ -599,6 +605,10 @@ struct zone { bool contiguous; +#ifdef CONFIG_PAGE_PREZERO + bool alloc_zero; +#endif + ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 64e4ac9a2a2f7ac92dd9853dd575d43c89634af9..4798742b534389a5d13985db91a28e4139e41606 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -180,6 +180,10 @@ enum pageflags { /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, +#ifdef CONFIG_PAGE_PREZERO + /* Only valid for buddy pages. Used to track pages that are zeroed */ + PG_zeroed = PG_dirty, +#endif }; #ifndef __GENERATING_BOUNDS_H @@ -317,6 +321,9 @@ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname) \ static inline void ClearPage##uname(struct page *page) { } +#define __SETPAGEFLAG_NOOP(uname) \ +static inline void __SetPage##uname(struct page *page) { } + #define __CLEARPAGEFLAG_NOOP(uname) \ static inline void __ClearPage##uname(struct page *page) { } @@ -458,6 +465,14 @@ PAGEFLAG(Idle, idle, PF_ANY) */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) +#ifdef CONFIG_PAGE_PREZERO +__PAGEFLAG(Zeroed, zeroed, PF_NO_COMPOUND) +#else +TESTPAGEFLAG_FALSE(Zeroed) +__SETPAGEFLAG_NOOP(Zeroed) +__CLEARPAGEFLAG_NOOP(Zeroed) +#endif + #ifdef CONFIG_DUPTEXT /* PageDup() is used to track page that has NUMA replicas. 
*/ PAGEFLAG(Dup, dup, PF_HEAD) diff --git a/include/linux/prezero.h b/include/linux/prezero.h new file mode 100644 index 0000000000000000000000000000000000000000..365ecef6dc369865d5f60e45abd05b73f2ef44e8 --- /dev/null +++ b/include/linux/prezero.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PREZERO_H +#define _LINUX_PREZERO_H + +#include + +enum prezero_flag { + PREZERO_BUDDY_FLAG, + PREZERO_PCP_FLAG, + PREZERO_MAX_FLAG, +}; + +#ifdef CONFIG_PAGE_PREZERO +DECLARE_STATIC_KEY_FALSE(prezero_enabled_key); +extern unsigned long prezero_enabled_flag; + +static inline bool prezero_enabled(void) +{ + return static_branch_unlikely(&prezero_enabled_key); +} + +static inline bool prezero_buddy_enabled(void) +{ + return prezero_enabled() && + (prezero_enabled_flag & (1 << PREZERO_BUDDY_FLAG)); +} + +static inline bool prezero_pcp_enabled(void) +{ + return prezero_enabled() && + (prezero_enabled_flag & (1 << PREZERO_PCP_FLAG)); +} + +#else +static inline bool prezero_enabled(void) +{ + return false; +} + +static inline bool prezero_buddy_enabled(void) +{ + return false; +} + +static inline bool prezero_pcp_enabled(void) +{ + return false; +} +#endif /* CONFIG_PAGE_PREZERO */ + +#endif /* _LINUX_PREZERO_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e37bd76f6b31d88951a9cededaec63..7706d404aa0e8e75826679767209057c7c49a6d2 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -120,6 +120,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#endif +#ifdef CONFIG_PAGE_PREZERO + PREZERO_ALLOC, + PREZERO_ALLOC_PAGES, + PREZERO_HW_CLEAR, + PREZERO_HW_CLEAR_PAGES, #endif NR_VM_EVENT_ITEMS }; diff --git a/mm/Kconfig b/mm/Kconfig index 7671747114d3d02ae7ffff0eba38536091a5a0b2..c5312bc1c35d125f9e19eb882563daa30f1995c3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -945,4 +945,17 @@ config FAST_COPY_MM unserviceable duration. Note that it won't speed up child's return from fork(2). +config PAGE_PREZERO + bool "Clear page asynchronously" + depends on !INIT_ON_ALLOC_DEFAULT_ON + default n + help + This feature enables per-node kernel threads to clear (zero) buddy + pages asynchronously in advance to build a pool of pre-zeroed pages + on each NUMA node, and speeds up __GFP_ZERO page allocation. + + Furthermore, the work of page clear can be offloaded and accelerated + with accelerators like Intel DSA. User may configure the DMA device + on each NUMA node before enabling this feature. 
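The enable knob introduced in include/linux/prezero.h above is a two-bit mask: bit 0 (PREZERO_BUDDY_FLAG) gates pre-zeroed allocation straight from the buddy free lists, bit 1 (PREZERO_PCP_FLAG) gates the per-cpu list refill path. A standalone sketch (illustrative only; flag_set() is not a kernel helper) of how values written via the prezero= parameter or the sysfs enabled file decode:

#include <stdbool.h>
#include <stdio.h>

enum prezero_flag {
	PREZERO_BUDDY_FLAG,
	PREZERO_PCP_FLAG,
	PREZERO_MAX_FLAG,
};

/* Mirrors the bit tests in prezero_buddy_enabled()/prezero_pcp_enabled(). */
static bool flag_set(unsigned long val, enum prezero_flag f)
{
	return val & (1UL << f);
}

int main(void)
{
	/* 1 = buddy only, 2 = pcp only, 3 = both; values above 3 are rejected. */
	for (unsigned long val = 0; val <= 3; val++)
		printf("%lu: buddy=%d pcp=%d\n", val,
		       flag_set(val, PREZERO_BUDDY_FLAG),
		       flag_set(val, PREZERO_PCP_FLAG));
	return 0;
}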
+ endmenu diff --git a/mm/Makefile b/mm/Makefile index 42080afa8cd0639f0dba429f414ea0fabe70fd94..ab7463c31cbc663aa3fef497ad23dcb18ddd1c8f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -132,3 +132,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-y += unevictable.o obj-$(CONFIG_DUPTEXT) += page_dup.o obj-$(CONFIG_FAST_COPY_MM) += fast_copy_mm.o +obj-$(CONFIG_PAGE_PREZERO) += prezero.o diff --git a/mm/compaction.c b/mm/compaction.c index c6d55f1b627dd11e28c971cdd553e50ff8d3d261..4b3fb687480807b1d4cfed5f8d60945a0da97268 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -97,6 +97,7 @@ static void split_map_pages(struct list_head *list) order = page_private(page); nr_pages = 1 << order; + set_page_private(page, 0); post_alloc_hook(page, order, __GFP_MOVABLE); if (order) split_page(page, order); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 904773bb34f970575bf11cb34ccfb83459a7886f..1edf983f84d354466a15bc7c9c500aa9828b416f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -719,7 +720,7 @@ unsigned long hugetext_get_unmapped_area(struct file *filp, unsigned long addr, #endif /* CONFIG_HUGETEXT */ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, - struct page *page, gfp_t gfp) + struct page *page, gfp_t gfp, bool zeroed) { struct vm_area_struct *vma = vmf->vma; pgtable_t pgtable; @@ -742,7 +743,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, goto release; } - clear_huge_page(page, vmf->address, HPAGE_PMD_NR); + if (!zeroed) + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); + /* * The memory barrier inside __SetPageUptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -854,6 +857,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) gfp_t gfp; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + bool zeroed = false; if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; @@ -900,13 +904,19 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } gfp = alloc_hugepage_direct_gfpmask(vma); + + if (prezero_enabled()) { + gfp |= __GFP_ZERO; + zeroed = true; + } + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(vmf, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, page, gfp, zeroed); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/internal.h b/mm/internal.h index 404a44e457e9d24a8b24e68b269b404d48d5b008..daad3493bd67b9fad80d414c1a420b0e23cc7511 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -270,6 +270,38 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, #endif +/* + * Reuse the bit above highest-possible page order (MAX_ORDER - 1) of + * page->private, to _temporarily_ indicate that the page is pre-zeroed. + * + * This bit is only used for pages newly allocated from buddy, neither + * buddy pages nor lru pages, etc., in the page allocation path. + * + * Specifically, this bit is set in __rmqueue_smallest(), and cleared in + * prep_new_page() or free_pcppages_bulk(). Setting this bit anywhere else + * is a bug. 
+ */ +#ifdef CONFIG_PAGE_PREZERO +#define PAGE_ZEROED (1UL << (ilog2(MAX_ORDER - 1) + 1)) +#else +#define PAGE_ZEROED 0 +#endif + +static inline bool page_zeroed(struct page *page) +{ + return page_private(page) & PAGE_ZEROED; +} + +static inline void set_page_zeroed(struct page *page) +{ + page->private |= PAGE_ZEROED; +} + +static inline void clear_page_zeroed(struct page *page) +{ + page->private &= ~PAGE_ZEROED; +} + /* * This function returns the order of a free page in the buddy system. In * general, page_zone(page)->lock must be held by the caller to prevent the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5f2fcaf55cb9d2a00979cc3b5e8e3e841cde7c71..f96d84154ce71fbc306fc593f438ca2006584d73 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -943,6 +944,15 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, if (page_reported(page)) __ClearPageReported(page); +#ifdef CONFIG_PAGE_PREZERO + /* clear pre-zeroed state */ + if (PageZeroed(page)) { + __ClearPageZeroed(page); + zone->free_area[order].nr_zeroed--; + __mod_zone_page_state(zone, NR_ZEROED_PAGES, -(1 << order)); + } +#endif + list_del(&page->lru); __ClearPageBuddy(page); set_page_private(page, 0); @@ -1041,6 +1051,10 @@ static inline void __free_one_page(struct page *page, goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; + + /* Clear PG_zeroed when merging. */ + __ClearPageZeroed(page); + /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@ -1218,6 +1232,18 @@ static void kernel_init_free_pages(struct page *page, int numpages) { int i; +#ifdef CONFIG_PAGE_PREZERO + /* + * Skip clear if page is pre-zeroed. + * But force clear if !prezero_enabled(). + */ + if (prezero_enabled() && page_zeroed(page)) { + count_vm_event(PREZERO_ALLOC); + __count_vm_events(PREZERO_ALLOC_PAGES, numpages); + return; + } +#endif + /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); for (i = 0; i < numpages; i++) @@ -1284,6 +1310,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; + set_page_private(page, 0); if (memcg_kmem_enabled() && PageKmemcg(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) @@ -1426,6 +1453,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, list_del(&page->lru); pcp->count--; + /* + * PAGE_ZEROED bit may be set for pcp pages, see + * comments in __rmqueue_smallest(). Clear this bit + * if any. 
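PAGE_ZEROED reuses the bit just above the highest possible page order, so it can ride along in page->private without colliding with an order value temporarily stored there (for example by isolation or compaction). A standalone check of that arithmetic, assuming the common MAX_ORDER of 11 (ilog2_u() stands in for the kernel's ilog2()):

#include <stdio.h>

/* ilog2() for small positive values, same result as the kernel helper. */
static unsigned int ilog2_u(unsigned long v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned long max_order = 11;	/* assumed; matches common configs */
	unsigned long bit = 1UL << (ilog2_u(max_order - 1) + 1);
	unsigned long order;

	/* No buddy order value may overlap the PAGE_ZEROED bit. */
	for (order = 0; order < max_order; order++)
		if (order & bit)
			printf("collision at order %lu\n", order);

	printf("PAGE_ZEROED bit value: %lu\n", bit);	/* 16 for MAX_ORDER=11 */
	return 0;
}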
+ */ + clear_page_zeroed(page); + if (bulkfree_pcp_prepare(page)) continue; @@ -2208,7 +2242,7 @@ void __init init_cma_reserved_pageblock(struct page *page) * -- nyc */ static inline void expand(struct zone *zone, struct page *page, - int low, int high, int migratetype) + int low, int high, int migratetype, bool zeroed) { unsigned long size = 1 << high; @@ -2226,8 +2260,20 @@ static inline void expand(struct zone *zone, struct page *page, if (set_page_guard(zone, &page[size], high, migratetype)) continue; - add_to_free_list(&page[size], zone, high, migratetype); set_buddy_order(&page[size], high); +#ifdef CONFIG_PAGE_PREZERO + if (zeroed) { + add_to_free_list_tail(&page[size], zone, high, + migratetype); + __SetPageZeroed(&page[size]); + zone->free_area[high].nr_zeroed++; + __mod_zone_page_state(zone, NR_ZEROED_PAGES, 1 << high); + } else { + add_to_free_list(&page[size], zone, high, migratetype); + } +#else + add_to_free_list(&page[size], zone, high, migratetype); +#endif } } @@ -2315,7 +2361,7 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { - set_page_private(page, 0); + WARN_ON_ONCE(page_private(page) & ~PAGE_ZEROED); set_page_refcounted(page); arch_alloc_page(page, order); @@ -2334,6 +2380,9 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) kernel_init_free_pages(page, 1 << order); + /* Clear pre-zeroed state (PAGE_ZEROED bit) if any. */ + clear_page_zeroed(page); + if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -2360,16 +2409,41 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, unsigned int current_order; struct free_area *area; struct page *page; + struct list_head __maybe_unused *list; /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { + bool zeroed; area = &(zone->free_area[current_order]); +#ifdef CONFIG_PAGE_PREZERO + list = &area->free_list[migratetype]; + + if (unlikely(list_empty(list))) + page = NULL; + else if (zone->alloc_zero) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); +#else page = get_page_from_free_area(area, migratetype); +#endif + if (!page) continue; + + /* Stash this away before del_page_from_free_list() zaps it */ + zeroed = PageZeroed(page); + del_page_from_free_list(page, zone, current_order); - expand(zone, page, order, current_order, migratetype); + expand(zone, page, order, current_order, migratetype, zeroed); set_pcppage_migratetype(page, migratetype); + /* + * NOTE This is a hack. The pre-zeroed state was zapped + * above and restored here, and should finally be cleared + * in prep_new_page() or free_pcppages_bulk(). 
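The free-list discipline here is that expand() appends pre-zeroed buddies to the tail of the free list, while __rmqueue_smallest() pops from the tail only when zone->alloc_zero is set for a __GFP_ZERO request; ordinary requests keep draining the head, which preserves the pre-zeroed pool. A toy model of that head/tail policy (illustrative only, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 6

/* Toy free list: index 0 is the head, NPAGES-1 the tail. */
struct toy_page { int id; bool zeroed; };

static struct toy_page list[NPAGES] = {
	{0, false}, {1, false}, {2, false},	/* head: not pre-zeroed */
	{3, true},  {4, true},  {5, true},	/* tail: pre-zeroed */
};
static int head = 0, tail = NPAGES - 1;

/* Zeroed requests take from the tail, everything else from the head. */
static struct toy_page *take(bool want_zeroed)
{
	if (head > tail)
		return NULL;
	return want_zeroed ? &list[tail--] : &list[head++];
}

int main(void)
{
	struct toy_page *p;

	p = take(false);	/* ordinary allocation -> page 0, not zeroed */
	printf("normal alloc got page %d (zeroed=%d)\n", p->id, p->zeroed);
	p = take(true);		/* __GFP_ZERO allocation -> page 5, pre-zeroed */
	printf("zero   alloc got page %d (zeroed=%d)\n", p->id, p->zeroed);
	return 0;
}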
+ */ + if (zeroed) + set_page_zeroed(page); return page; } @@ -2930,11 +3004,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, unsigned int alloc_flags) + gfp_t gfp_flags, int migratetype, + unsigned int alloc_flags) { int i, alloced = 0; spin_lock(&zone->lock); +#ifdef CONFIG_PAGE_PREZERO + zone->alloc_zero = prezero_pcp_enabled() && (gfp_flags & __GFP_ZERO); +#endif for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, alloc_flags); @@ -2968,6 +3046,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); +#ifdef CONFIG_PAGE_PREZERO + zone->alloc_zero = false; +#endif spin_unlock(&zone->lock); return alloced; } @@ -3423,17 +3504,16 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) } /* Remove page from the per-cpu list, caller must protect the list */ -static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, - unsigned int alloc_flags, - struct per_cpu_pages *pcp, - struct list_head *list) +static struct page *__rmqueue_pcplist(struct zone *zone, gfp_t gfp_flags, + int migratetype, unsigned int alloc_flags, + struct per_cpu_pages *pcp, struct list_head *list) { struct page *page; do { if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, + pcp->batch, list, gfp_flags, migratetype, alloc_flags); if (unlikely(list_empty(list))) return NULL; @@ -3460,7 +3540,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); + page = __rmqueue_pcplist(zone, gfp_flags, migratetype, alloc_flags, + pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); @@ -3501,6 +3582,10 @@ struct page *rmqueue(struct zone *preferred_zone, WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); +#ifdef CONFIG_PAGE_PREZERO + zone->alloc_zero = prezero_buddy_enabled() && (gfp_flags & __GFP_ZERO); +#endif + do { page = NULL; /* @@ -3517,6 +3602,11 @@ struct page *rmqueue(struct zone *preferred_zone, if (!page) page = __rmqueue(zone, order, migratetype, alloc_flags); } while (page && check_new_pages(page, order)); + +#ifdef CONFIG_PAGE_PREZERO + zone->alloc_zero = false; +#endif + spin_unlock(&zone->lock); if (!page) goto failed; @@ -4196,8 +4286,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, count_vm_event(COMPACTSTALL); /* Prep a captured page if available */ - if (page) + if (page) { + set_page_private(page, 0); prep_new_page(page, order, gfp_mask, alloc_flags); + } /* Try get a page from the freelist if available */ if (!page) @@ -5021,6 +5113,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, page = kfence_alloc_page(order, preferred_nid, gfp_mask); if (unlikely(page)) { + set_page_private(page, 0); prep_new_page(page, 0, gfp_mask, alloc_mask); goto out; } @@ -6278,6 +6371,9 @@ static void __meminit zone_init_free_lists(struct zone *zone) for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; +#ifdef CONFIG_PAGE_PREZERO + zone->free_area[order].nr_zeroed = 
0; +#endif } } diff --git a/mm/prezero.c b/mm/prezero.c new file mode 100644 index 0000000000000000000000000000000000000000..efe3b24a16ca643b4eec70f62e2eba97252a87c9 --- /dev/null +++ b/mm/prezero.c @@ -0,0 +1,641 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +DEFINE_STATIC_KEY_FALSE(prezero_enabled_key); +unsigned long prezero_enabled_flag; +static unsigned int prezero_min_order = 9; +static unsigned int prezero_max_percent = 50; +static unsigned int prezero_batch_pages = 4096; +static unsigned int prezero_sleep_msecs = 1000; +static struct task_struct *prezero_kthread[MAX_NUMNODES]; +static wait_queue_head_t kprezerod_wait[MAX_NUMNODES]; +static unsigned long kprezerod_sleep_expire[MAX_NUMNODES]; + +static DEFINE_STATIC_KEY_FALSE(prezero_hw_enabled_key); +static bool prezero_hw_flag_cc; +static bool prezero_hw_polling; +static inline bool prezero_hw_enabled(void) +{ + return static_branch_unlikely(&prezero_hw_enabled_key); +} +static int clear_page_hw(struct page *page, int order, int node); + +static void my_clear_page(struct page *page, unsigned int order, int node) +{ + int i, numpages = 1 << order; + + if (prezero_hw_enabled() && + !clear_page_hw(page, order, node)) { + count_vm_event(PREZERO_HW_CLEAR); + __count_vm_events(PREZERO_HW_CLEAR_PAGES, numpages); + return; + } + + for (i = 0; i < numpages; i++) + clear_highpage(page + i); +} + +static int prezero_one_page(struct zone *zone, unsigned int order, int mtype) +{ + struct free_area *area = &zone->free_area[order]; + struct list_head *list = &area->free_list[mtype]; + struct page *page_to_zero = NULL, *page, *next; + int err = -ENOMEM; + + /* + * Perform early check, if free area is empty there is + * nothing to process so we can skip this free_list. + */ + if (list_empty(list)) + return err; + + /* Isolate a non-zeroed page */ + spin_lock_irq(&zone->lock); + list_for_each_entry_safe(page, next, list, lru) { + /* We are going to skip over the pre-zeroed pages. */ + if (PageZeroed(page)) + continue; + + if (__isolate_free_page(page, order)) + page_to_zero = page; + else + next = page; + + /* + * Make the next page in the free list the new head + * of the free list before we release the zone lock. + */ + if (&next->lru != list && !list_is_first(&next->lru, list)) + list_rotate_to_front(&next->lru, list); + + break; + } + spin_unlock_irq(&zone->lock); + + /* Failed to isolate non-zeroed page */ + if (!page_to_zero) + return err; + + /* Clear the page */ + my_clear_page(page, order, zone_to_nid(zone)); + + /* Putback the pre-zeroed page */ + spin_lock_irq(&zone->lock); + mtype = get_pageblock_migratetype(page); + __putback_isolated_page(page, order, mtype); + + /* + * If page was not comingled with another page we can consider + * the page to be zeroed since the page hasn't been modified, + * otherwise we will need to discard the zeroed state of this page. 
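For scale, with the module defaults above (prezero_min_order = 9, prezero_batch_pages = 4096) and an assumed 4 KiB base page, kprezerod only touches blocks of 2 MiB and larger and clears at most 16 MiB worth of pages per order per node in one pass; a trivial check of that arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed base page size */
	unsigned int min_order = 9;		/* default prezero_min_order */
	unsigned int batch_pages = 4096;	/* default prezero_batch_pages */

	/* Smallest block kprezerod will clear: 2^9 pages = 2048 KiB. */
	printf("min block: %lu KiB\n", ((1UL << min_order) * page_size) >> 10);
	/* Upper bound cleared per order per node per pass: 16384 KiB. */
	printf("batch cap: %lu KiB\n", ((unsigned long)batch_pages * page_size) >> 10);
	return 0;
}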
+ */
+	if (PageBuddy(page) && buddy_order(page) == order) {
+		__SetPageZeroed(page);
+		zone->free_area[order].nr_zeroed++;
+		__mod_zone_page_state(zone, NR_ZEROED_PAGES, 1 << order);
+	}
+
+	spin_unlock_irq(&zone->lock);
+
+	return 0;
+}
+
+static void prezero_do_work(pg_data_t *pgdat)
+{
+	struct zone *zone = &pgdat->node_zones[ZONE_NORMAL];
+	/* NOTE only MIGRATE_MOVABLE is supported currently */
+	int mtype = MIGRATE_MOVABLE;
+	unsigned int order;
+	unsigned long nr_free, nr_zeroed;
+	unsigned int nr_done;
+
+	for (order = prezero_min_order; order < MAX_ORDER; order++) {
+		/*
+		 * Use data_race to avoid KCSAN warning since access
+		 * to nr_free and nr_zeroed is lockless here.
+		 *
+		 * Since only MIGRATE_MOVABLE is supported at present,
+		 * setting prezero_max_percent too high could prevent
+		 * kprezerod from bailing out early.
+		 */
+		nr_free = data_race(zone->free_area[order].nr_free);
+		/* Ditto. */
+		nr_zeroed = data_race(zone->free_area[order].nr_zeroed);
+
+		if (nr_zeroed >= nr_free * prezero_max_percent / 100)
+			continue;
+
+		nr_done = 0;
+		while (nr_done < prezero_batch_pages) {
+			if (prezero_one_page(zone, order, mtype) < 0)
+				break;
+			nr_done += 1 << order;
+		}
+	}
+}
+
+static bool kprezerod_should_wakeup(int nid)
+{
+	return kthread_should_stop() ||
+	       time_after_eq(jiffies, kprezerod_sleep_expire[nid]);
+}
+
+static int prezero(void *data)
+{
+	pg_data_t *pgdat = (pg_data_t *)data;
+	int nid = pgdat->node_id;
+
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		unsigned long sleep_jiffies =
+			msecs_to_jiffies(prezero_sleep_msecs);
+
+		kprezerod_sleep_expire[nid] = jiffies + sleep_jiffies;
+		if (wait_event_freezable_timeout(kprezerod_wait[nid],
+					kprezerod_should_wakeup(nid),
+					sleep_jiffies))
+			prezero_do_work(pgdat);
+	}
+
+	return 0;
+}
+
+static void __start_stop_kprezerod(int nid)
+{
+	if (prezero_enabled()) {
+		if (!prezero_kthread[nid])
+			prezero_kthread[nid] = kthread_run(prezero,
+					NODE_DATA(nid), "kprezerod%d", nid);
+		if (IS_ERR(prezero_kthread[nid])) {
+			pr_err("failed to run kprezerod on node %d\n", nid);
+			prezero_kthread[nid] = NULL;
+		}
+	} else if (prezero_kthread[nid]) {
+		kthread_stop(prezero_kthread[nid]);
+		prezero_kthread[nid] = NULL;
+	}
+}
+
+static void start_stop_kprezerod(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY)
+		__start_stop_kprezerod(nid);
+}
+
+/*
+ * Page clear engine support - hardware offloading for page clearing.
+ *
+ * The page clear engine allows using a DMA device through the dmaengine API
+ * to clear (zero) pages asynchronously.
+ *
+ * User may configure the DMA device on each NUMA node before enabling this
+ * feature.
+ */
+#define DMA_TIMEOUT 5000
+static DEFINE_MUTEX(nodedata_mutex);
+static struct nodedata {
+	struct dma_chan *dma_chan;
+} *nodedata;
+
+static void dma_completion_callback(void *arg)
+{
+	struct completion *done = arg;
+
+	complete(done);
+}
+
+/*
+ * DMA engine APIs are called to prepare and submit DMA descriptors, and to
+ * check completion status. The dest_addr of the descriptor is filled with the
+ * DMA mapped address of the page to be cleared.
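The prezero_max_percent cutoff in prezero_do_work() above stops clearing an order once the pre-zeroed share of its free blocks reaches the threshold (50 percent by default). The same comparison in isolation (order_done() is an illustrative name, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

/* Skip an order once the share of pre-zeroed free blocks reaches max_percent. */
static bool order_done(unsigned long nr_free, unsigned long nr_zeroed,
		       unsigned int max_percent)
{
	return nr_zeroed >= nr_free * max_percent / 100;
}

int main(void)
{
	/* 1000 free order-9 blocks, default max_percent = 50 */
	printf("%d\n", order_done(1000, 499, 50));	/* 0: keep zeroing */
	printf("%d\n", order_done(1000, 500, 50));	/* 1: skip this order */
	return 0;
}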
+ */ +static int clear_page_hw(struct page *page, int order, int node) +{ + struct dma_chan *dma_chan = NULL; + struct device *dev; + struct dma_async_tx_descriptor *tx = NULL; + dma_addr_t dst_dma; + dma_cookie_t cookie; + enum dma_status status; + unsigned long dma_flags = 0; + bool hw_flag_cc = prezero_hw_flag_cc; + bool hw_polling = prezero_hw_polling; + int ret = 0; + DECLARE_COMPLETION_ONSTACK(done); + + mutex_lock(&nodedata_mutex); + /* Page clear engine is already disabled */ + if (!nodedata) { + ret = -ENODEV; + goto err_nodedata; + } + + dma_chan = nodedata[node].dma_chan; + dev = dma_chan->device->dev; + + /* DMA map page */ + dst_dma = dma_map_page(dev, page, 0, PAGE_SIZE << order, + DMA_FROM_DEVICE); + ret = dma_mapping_error(dev, dst_dma); + if (ret) + goto err_nodedata; + + if (!hw_flag_cc) + dma_flags |= DMA_PREP_NONTEMPORAL; + + if (!hw_polling) + dma_flags |= DMA_PREP_INTERRUPT; + + /* Prep DMA memset */ + tx = dmaengine_prep_dma_memset(dma_chan, dst_dma, 0, + PAGE_SIZE << order, dma_flags); + if (!tx) { + pr_info("Failed to prep DMA memset on node %d\n", node); + ret = -EIO; + goto err_prep; + } + + if (!hw_polling) { + tx->callback = dma_completion_callback; + tx->callback_param = &done; + } + + /* Submit DMA descriptor */ + cookie = dmaengine_submit(tx); + if (dma_submit_error(cookie)) { + pr_info("Failed to submit DMA descriptor on node %d\n", node); + ret = -EIO; + goto err_prep; + } + + if (hw_polling) { + /* Check DMA completion status with polling */ + status = dma_sync_wait(dma_chan, cookie); + if (status != DMA_COMPLETE) { + pr_info("Failed to poll DMA completion status on node %d\n", node); + ret = -EIO; + } + } else { + dma_async_issue_pending(dma_chan); + if (!wait_for_completion_timeout(&done, + msecs_to_jiffies(DMA_TIMEOUT))) { + ret = -EIO; + goto err_prep; + } + status = dma_async_is_tx_complete(dma_chan, cookie); + if (status != DMA_COMPLETE) { + pr_info("Failed to check DMA completion status on node %d\n", node); + ret = -EIO; + } + } + +err_prep: + dma_unmap_page(dev, dst_dma, PAGE_SIZE << order, DMA_FROM_DEVICE); +err_nodedata: + mutex_unlock(&nodedata_mutex); + return ret; +} + +static bool engine_filter_fn(struct dma_chan *chan, void *node) +{ + return dev_to_node(&chan->dev->device) == (int)(unsigned long)node; +} + +/* + * It initially requests a DMA channel with DMA_MEMSET capability on each NUMA + * node and uses the DMA device to clear high order pages. + * + * The preference is to request the DMA channel from local NUMA node. If it is + * not available, try again to request the DMA channel from any NUMA node. 
+ */ +static int get_dma_chan(int node) +{ + dma_cap_mask_t mask; + + /* Request DMA channel by mask */ + dma_cap_zero(mask); + dma_cap_set(DMA_MEMSET, mask); + + /* Prefer to request DMA channel from local NUMA node if available */ + nodedata[node].dma_chan = dma_request_channel(mask, engine_filter_fn, + (void *)(unsigned long)node); + if (!nodedata[node].dma_chan) { + /* Try again to request the DMA channel from any NUMA node */ + nodedata[node].dma_chan = dma_request_chan_by_mask(&mask); + if (IS_ERR(nodedata[node].dma_chan)) { + pr_info("Failed to request DMA channel on node %d\n", node); + nodedata[node].dma_chan = NULL; + return -ENODEV; + } + } + + return 0; +} + +static int init_page_clear_engine(void) +{ + int node, num_nodes; + int ret; + + /* Page clear engine is already enabled */ + if (nodedata) + return 0; + + num_nodes = num_online_nodes(); + nodedata = kcalloc(num_nodes, sizeof(*nodedata), GFP_KERNEL); + if (!nodedata) + return -ENOMEM; + + for_each_online_node(node) { + ret = get_dma_chan(node); + if (ret) + goto fail; + } + + pr_info("Hardware page clear engine is enabled\n"); + return 0; + +fail: + for (node = 0; node < num_nodes; node++) { + if (nodedata[node].dma_chan) + dma_release_channel(nodedata[node].dma_chan); + } + + kfree(nodedata); + nodedata = NULL; + + return ret; +} + +static void exit_page_clear_engine(void) +{ + int node; + + /* Page clear engine is already disabled */ + if (!nodedata) + return; + + mutex_lock(&nodedata_mutex); + for_each_online_node(node) { + dma_release_channel(nodedata[node].dma_chan); + } + + kfree(nodedata); + nodedata = NULL; + mutex_unlock(&nodedata_mutex); + + pr_info("Hardware page clear engine is disabled\n"); +} + +static int __init setup_prezero(char *str) +{ + unsigned long val; + int err; + + if (!str) + return 0; + + err = kstrtoul(str, 0, &val); + if (err < 0 || val > (1UL << PREZERO_MAX_FLAG) - 1) + return 0; + + prezero_enabled_flag = val; + + return 1; +} +__setup("prezero=", setup_prezero); + +#ifdef CONFIG_SYSFS +static ssize_t prezero_show_enabled(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", prezero_enabled_flag); +} +static ssize_t prezero_store_enabled(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + static DEFINE_MUTEX(mutex); + unsigned long val; + int err; + ssize_t ret = count; + + mutex_lock(&mutex); + + err = kstrtoul(buf, 0, &val); + if (err < 0 || val > (1UL << PREZERO_MAX_FLAG) - 1) { + ret = -EINVAL; + goto out; + } + + prezero_enabled_flag = val; + + if (prezero_enabled_flag) + static_branch_enable(&prezero_enabled_key); + else + static_branch_disable(&prezero_enabled_key); + + start_stop_kprezerod(); + +out: + mutex_unlock(&mutex); + return ret; +} +static struct kobj_attribute prezero_attr_enabled = + __ATTR(enabled, 0644, prezero_show_enabled, + prezero_store_enabled); + +#define PREZERO_SYSFS_ATTR(name, field, min_val, max_val, store_cb) \ +static ssize_t prezero_show_##name(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%u\n", field); \ +} \ +static ssize_t prezero_store_##name(struct kobject *kobj, \ + struct kobj_attribute *attr, const char *buf, size_t count) \ +{ \ + unsigned long val; \ + int ret; \ + \ + ret = kstrtoul(buf, 0, &val); \ + if (ret || val < min_val || val > max_val) \ + return -EINVAL; \ + \ + field = val; \ + store_cb(); \ + return count; \ +} \ +static struct kobj_attribute prezero_attr_##name = \ + __ATTR(name, 0644, prezero_show_##name, 
prezero_store_##name) + +static void dummy_store_cb(void) +{ +} + +static void prezero_sleep_msecs_store_cb(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + kprezerod_sleep_expire[nid] = 0; + wake_up_interruptible(&kprezerod_wait[nid]); + } +} + +PREZERO_SYSFS_ATTR(min_order, prezero_min_order, 0, MAX_ORDER - 1, + dummy_store_cb); +PREZERO_SYSFS_ATTR(max_percent, prezero_max_percent, 0, 100, + dummy_store_cb); +PREZERO_SYSFS_ATTR(batch_pages, prezero_batch_pages, 0, UINT_MAX, + dummy_store_cb); +PREZERO_SYSFS_ATTR(sleep_msecs, prezero_sleep_msecs, 0, UINT_MAX, + prezero_sleep_msecs_store_cb); + +static struct attribute *prezero_attrs[] = { + &prezero_attr_enabled.attr, + &prezero_attr_min_order.attr, + &prezero_attr_max_percent.attr, + &prezero_attr_batch_pages.attr, + &prezero_attr_sleep_msecs.attr, + NULL, +}; + +static struct attribute_group prezero_attr_group = { + .attrs = prezero_attrs, +}; + +static ssize_t prezero_show_hw_enabled(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", prezero_hw_enabled()); +} +static ssize_t prezero_store_hw_enabled(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + static DEFINE_MUTEX(mutex); + unsigned long val; + int err; + ssize_t ret = count; + + mutex_lock(&mutex); + + err = kstrtoul(buf, 0, &val); + if (err < 0 || val > 1) { + ret = -EINVAL; + goto out; + } + + if (val) { + if (!prezero_hw_enabled()) { + err = init_page_clear_engine(); + if (!err) + static_branch_enable(&prezero_hw_enabled_key); + else + ret = err; + } + } else { + if (prezero_hw_enabled()) { + static_branch_disable(&prezero_hw_enabled_key); + exit_page_clear_engine(); + } + } + +out: + mutex_unlock(&mutex); + return ret; +} +static struct kobj_attribute prezero_attr_hw_enabled = + __ATTR(hw_enabled, 0644, prezero_show_hw_enabled, + prezero_store_hw_enabled); + +PREZERO_SYSFS_ATTR(hw_flag_cc, prezero_hw_flag_cc, 0, 1, dummy_store_cb); +PREZERO_SYSFS_ATTR(hw_polling, prezero_hw_polling, 0, 1, dummy_store_cb); + +static struct attribute *page_clear_engine_attrs[] = { + &prezero_attr_hw_enabled.attr, + &prezero_attr_hw_flag_cc.attr, + &prezero_attr_hw_polling.attr, + NULL, +}; + +static struct attribute_group page_clear_engine_attr_group = { + .attrs = page_clear_engine_attrs, + .name = "page_clear_engine", +}; + +static int __init prezero_sysfs_init(void) +{ + struct kobject *prezero_kobj; + int err; + + /* + * err = sysfs_create_group(mm_kobj, &prezero_attr_group); + * if (err) + * pr_err("failed to register prezero group\n"); + */ + + + prezero_kobj = kobject_create_and_add("prezero", mm_kobj); + if (unlikely(!prezero_kobj)) { + pr_err("failed to create prezero kobject\n"); + return -ENOMEM; + } + + err = sysfs_create_group(prezero_kobj, &prezero_attr_group); + if (err) { + pr_err("failed to register prezero group\n"); + goto delete_obj; + } + + err = sysfs_create_group(prezero_kobj, &page_clear_engine_attr_group); + if (err) { + pr_err("failed to register page_clear_engine group\n"); + goto remove_prezero_group; + } + + return 0; + +remove_prezero_group: + sysfs_remove_group(prezero_kobj, &prezero_attr_group); +delete_obj: + kobject_put(prezero_kobj); + return err; +} +#else +static inline int __init prezero_sysfs_init(void) +{ + return 0; +} +#endif /* CONFIG_SYSFS */ + +static int __init prezero_init(void) +{ + int ret; + int nid; + + ret = prezero_sysfs_init(); + if (ret < 0) + return ret; + + for_each_node_state(nid, N_MEMORY) { + init_waitqueue_head(&kprezerod_wait[nid]); + 
__start_stop_kprezerod(nid); + } + + return 0; +} +module_init(prezero_init); diff --git a/mm/vmstat.c b/mm/vmstat.c index 498ea2e06c0feb725993eb37eb56a933793d228f..0b6d3aa29ddf95378310f127383948eb552921ba 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1163,6 +1163,9 @@ const char * const vmstat_text[] = { "nr_zspages", #endif "nr_free_cma", +#ifdef CONFIG_PAGE_PREZERO + "nr_zeroed_pages", +#endif /* enum numa_stat_item counters */ #ifdef CONFIG_NUMA @@ -1353,6 +1356,12 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#ifdef CONFIG_PAGE_PREZERO + "prezero_alloc", + "prezero_alloc_pages", + "prezero_hw_clear", + "prezero_hw_clear_pages", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ @@ -1600,6 +1609,34 @@ static const struct seq_operations pagetypeinfo_op = { .show = pagetypeinfo_show, }; +#ifdef CONFIG_PAGE_PREZERO +static void zerobuddy_show_print(struct seq_file *m, pg_data_t *pgdat, + struct zone *zone) +{ + int order; + + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].nr_zeroed); + seq_putc(m, '\n'); +} + +static int zerobuddy_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + walk_zones_in_node(m, pgdat, true, false, zerobuddy_show_print); + return 0; +} + +static const struct seq_operations zerobuddy_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = zerobuddy_show, +}; +#endif + static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) { int zid; @@ -1628,6 +1665,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n pages free %lu" +#ifdef CONFIG_PAGE_PREZERO + "\n zeroed %lu" +#endif "\n min %lu" "\n low %lu" "\n high %lu" @@ -1636,6 +1676,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n managed %lu" "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), +#ifdef CONFIG_PAGE_PREZERO + zone_page_state(zone, NR_ZEROED_PAGES), +#endif min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), @@ -2049,6 +2092,9 @@ void __init init_mm_internals(void) proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); proc_create_seq("vmstat", 0444, NULL, &vmstat_op); proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); +#ifdef CONFIG_PAGE_PREZERO + proc_create_seq("zerobuddyinfo", 0444, NULL, &zerobuddy_op); +#endif #endif }
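Usage note: the feature is controlled through /sys/kernel/mm/prezero/ (enabled, min_order, max_percent, batch_pages, sleep_msecs, plus the page_clear_engine group with hw_enabled, hw_flag_cc and hw_polling), and the prezero= boot parameter seeds the same enable bitmask. Progress is visible as MemZeroed in /proc/meminfo and the per-node meminfo files, as the prezero_* and nr_zeroed_pages counters in /proc/vmstat, and as per-order counts in /proc/zerobuddyinfo. A minimal sketch (assumes CONFIG_PAGE_PREZERO and root) that turns on both the buddy and pcp paths:

#include <stdio.h>

int main(void)
{
	/* Writing 3 sets PREZERO_BUDDY_FLAG (bit 0) and PREZERO_PCP_FLAG (bit 1). */
	FILE *f = fopen("/sys/kernel/mm/prezero/enabled", "w");

	if (!f) {
		perror("enabled");
		return 1;
	}
	fputs("3\n", f);
	fclose(f);
	return 0;
}

Hardware offload is enabled separately by writing 1 to page_clear_engine/hw_enabled once a DMA_MEMSET-capable channel (for example Intel DSA via the idxd driver above) is available.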