diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 45af9a989d4040135a4fe18acc9b5ef055a2affc..875306ca2b397a12eda150c7a79b13b529b4c8ab 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -17,6 +17,9 @@ #ifdef CONFIG_CMA #include #endif +#ifdef CONFIG_MEM_PURGEABLE +#include +#endif #include #include #include "internal.h" @@ -40,6 +43,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long pages[NR_LRU_LISTS]; unsigned long sreclaimable, sunreclaim; int lru; + unsigned long nr_purg_active = 0; + unsigned long nr_purg_inactive = 0; +#ifdef CONFIG_MEM_PURGEABLE + unsigned long nr_purg_pined = 0; +#endif si_meminfo(&i); si_swapinfo(&i); @@ -53,6 +61,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); +#ifdef CONFIG_MEM_PURGEABLE + nr_purg_active = pages[LRU_ACTIVE_PURGEABLE]; + nr_purg_inactive = pages[LRU_INACTIVE_PURGEABLE]; + purg_pages_info(NULL, &nr_purg_pined); + nr_purg_pined = min(nr_purg_pined, nr_purg_active + nr_purg_inactive); +#endif + available = si_mem_available(); sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); @@ -64,13 +79,25 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Cached: ", cached); show_val_kb(m, "SwapCached: ", total_swapcache_pages()); show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + - pages[LRU_ACTIVE_FILE]); +#ifdef CONFIG_MEM_PURGEABLE + pages[LRU_ACTIVE_FILE] + + nr_purg_active); +#else + pages[LRU_ACTIVE_FILE]); +#endif + show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + - pages[LRU_INACTIVE_FILE]); + pages[LRU_INACTIVE_FILE] + + nr_purg_inactive); show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); +#ifdef CONFIG_MEM_PURGEABLE + show_val_kb(m, "Active(purg): ", nr_purg_active); + show_val_kb(m, "Inactive(purg): ", nr_purg_inactive); + show_val_kb(m, "Pined(purg): ", nr_purg_pined); +#endif show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bd8285811728757693ee96a17c6a622280cf9dd1..415a5bef74246e7c6185f4bdbeb6a163427a4cc9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_MEM_PURGEABLE +#include +#endif #include #include @@ -33,6 +36,11 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; +#ifdef CONFIG_MEM_PURGEABLE + unsigned long nr_purg_sum = 0, nr_purg_pin = 0; + + mm_purg_pages_info(mm, &nr_purg_sum, &nr_purg_pin); +#endif anon = get_mm_counter(mm, MM_ANONPAGES); file = get_mm_counter(mm, MM_FILEPAGES); @@ -76,6 +84,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) seq_put_decimal_ull_width(m, " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); +#ifdef CONFIG_MEM_PURGEABLE + SEQ_PUT_DEC(" kB\nPurgSum:\t", nr_purg_sum); + SEQ_PUT_DEC(" kB\nPurgPin:\t", nr_purg_pin); +#endif seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 
7179e3f6a0304b87cdfdf004383b55c301ac93b6..83f791100b84993e5f2de27a93c137ecc84c4a4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -320,16 +320,34 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_7 39 /* bit only usable on 64-bit architectures */ +#ifdef CONFIG_MEM_PURGEABLE +#define VM_HIGH_ARCH_BIT_8 40 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_9 41 /* bit only usable on 64-bit architectures */ +#endif /* CONFIG_MEM_PURGEABLE */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #define VM_HIGH_ARCH_7 BIT(VM_HIGH_ARCH_BIT_7) +#ifdef CONFIG_MEM_PURGEABLE +#define VM_HIGH_ARCH_8 BIT(VM_HIGH_ARCH_BIT_8) +#define VM_HIGH_ARCH_9 BIT(VM_HIGH_ARCH_BIT_9) +#endif /* CONFIG_MEM_PURGEABLE */ #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ +#ifdef CONFIG_MEM_PURGEABLE +#define VM_PURGEABLE VM_HIGH_ARCH_8 +#define VM_USEREXPTE VM_HIGH_ARCH_9 +#else /* CONFIG_MEM_PURGEABLE */ +#define VM_PURGEABLE 0 +#define VM_USEREXPTE 0 +#endif /* CONFIG_MEM_PURGEABLE */ + #ifdef CONFIG_SECURITY_XPM #define VM_XPM VM_HIGH_ARCH_7 #else /* CONFIG_SECURITY_XPM */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 96b1c157554c08c5ec8db06a045501738426231e..027591c9decb50d2b570881f51edc631f5afea1b 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -93,6 +93,10 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; +#ifdef CONFIG_MEM_PURGEABLE + if (folio_test_purgeable(folio)) + lru = LRU_INACTIVE_PURGEABLE; +#endif if (folio_test_active(folio)) lru += LRU_ACTIVE; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index db7003d2886f0e687e7ad4d27475b82a561e7404..7264a43f8d18464037f2ac917ebedbb612c02ab2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -703,6 +703,10 @@ struct mm_struct { #endif unsigned long task_size; /* size of task vm space */ pgd_t * pgd; +#ifdef CONFIG_MEM_PURGEABLE + void *uxpgd; + spinlock_t uxpgd_lock; +#endif #ifdef CONFIG_MEMBARRIER /** diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9cf03644fe90a5902c39195a77aae4f84cf8a68f..84da48194dbc3a96d70283a88a087753498ce5f3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -146,6 +146,10 @@ enum zone_stat_item { NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, +#ifdef CONFIG_MEM_PURGEABLE + NR_ZONE_INACTIVE_PURGEABLE, + NR_ZONE_ACTIVE_PURGEABLE, +#endif NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ @@ -166,6 +170,10 @@ enum node_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ +#ifdef CONFIG_MEM_PURGEABLE + NR_INACTIVE_PURGEABLE, + NR_ACTIVE_PURGEABLE, +#endif NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, @@ -268,12 +276,19 @@ static __always_inline bool vmstat_item_in_bytes(int idx) #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 +#ifdef CONFIG_MEM_PURGEABLE +#define LRU_PURGEABLE 4 +#endif enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, +#ifdef CONFIG_MEM_PURGEABLE + LRU_INACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE, + LRU_ACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE + LRU_ACTIVE, +#endif LRU_UNEVICTABLE, NR_LRU_LISTS }; @@ -288,7 +303,7 @@ enum vmscan_throttle_state { #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) -#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) +#define for_each_evictable_lru(lru) for (lru = 0; lru < LRU_UNEVICTABLE; lru++) static inline bool is_file_lru(enum lru_list lru) { @@ -297,6 +312,10 @@ static inline bool is_file_lru(enum lru_list lru) static inline bool is_active_lru(enum lru_list lru) { +#ifdef CONFIG_MEM_PURGEABLE + if (lru == LRU_ACTIVE_PURGEABLE) + return true; +#endif return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4d2e0c913baf37f3293702f95e5c6cad4fe3391d..af9b7524c692a89e46e2c8e64466247af42cb9ec 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -136,6 +136,9 @@ enum pageflags { PG_arch_2, PG_arch_3, #endif +#ifdef CONFIG_MEM_PURGEABLE + PG_purgeable, +#endif #ifdef CONFIG_SECURITY_XPM PG_xpm_readonly, PG_xpm_writetainted, @@ -615,6 +618,12 @@ PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif +#ifdef CONFIG_MEM_PURGEABLE +PAGEFLAG(Purgeable, purgeable, PF_ANY) +#else +PAGEFLAG_FALSE(Purgeable) +#endif + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/include/trace/events/mmflags.h 
b/include/trace/events/mmflags.h index 0a224af0e59aa3900a4e7b2282ed251e78f4ebea..5fee97d06e6e5eb7c1696693e1218f1d7c25e1eb 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -65,10 +65,16 @@ __def_gfpflag_names __def_gfpflag_names_kasan \ ) : "none" +#ifdef CONFIG_MEM_PURGEABLE +#define IF_HAVE_PG_PURGEABLE(_name) ,{1UL << PG_##_name, __stringify(_name)} +#else +#define IF_HAVE_PG_PURGEABLE(_name) +#endif + #ifdef CONFIG_SECURITY_XPM -#define IF_HAVE_PG_XPM_INTEGRITY(flag,string) ,{1UL << flag, string} +#define IF_HAVE_PG_XPM_INTEGRITY(_name) ,{1UL << PG_##_name, __stringify(_name)} #else -#define IF_HAVE_PG_XPM_INTEGRITY(flag,string) +#define IF_HAVE_PG_XPM_INTEGRITY(_name) #endif #ifdef CONFIG_MMU @@ -125,6 +131,7 @@ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ DEF_PAGEFLAG_NAME(unevictable) \ +IF_HAVE_PG_PURGEABLE(purgeable) \ IF_HAVE_PG_MLOCK(mlocked) \ IF_HAVE_PG_UNCACHED(uncached) \ IF_HAVE_PG_HWPOISON(hwpoison) \ diff --git a/kernel/fork.c b/kernel/fork.c index 92611a26a392354f33c9f0218ac31b05c2bcd45c..515267609be98a201aae564525a664ef9f01b433 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -99,7 +99,9 @@ #include #include #include - +#ifdef CONFIG_MEM_PURGEABLE +#include +#endif #include #include #include @@ -793,6 +795,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, static inline int mm_alloc_pgd(struct mm_struct *mm) { +#ifdef CONFIG_MEM_PURGEABLE + mm_init_uxpgd(mm); +#endif mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) return -ENOMEM; @@ -802,6 +807,9 @@ static inline int mm_alloc_pgd(struct mm_struct *mm) static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); +#ifdef CONFIG_MEM_PURGEABLE + mm_clear_uxpgd(mm); +#endif } #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) diff --git a/mm/Kconfig b/mm/Kconfig index 544d113729eb649ba9bbfa840193caf9bd6ddbe3..ee4c2cf539d78b4648ff89b43e7e8474b2b0b793 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1303,6 +1303,29 @@ config LOCK_MM_AND_FIND_VMA bool depends on !STACK_GROWSUP + +config MEM_PURGEABLE + bool "Purgeable memory feature" + default n + depends on 64BIT + select ARCH_USES_HIGH_VMA_FLAGS + help + Support purgeable pages for process + +config MEM_PURGEABLE_DEBUG + bool "Purgeable memory debug" + default n + depends on MEM_PURGEABLE + help + Debug info for purgeable memory + +config PURGEABLE_ASHMEM + bool "Purgeable memory feature for ashmem" + default n + depends on MEM_PURGEABLE + help + Support purgeable ashmem for process + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index f9fb7e07cdd854d764ef05768ab65483a9891df9..f84d4b0f521de445aa5a56324ee640e6448971ce 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -141,4 +141,6 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o +obj-$(CONFIG_MEM_PURGEABLE) += purgeable.o +obj-$(CONFIG_PURGEABLE_ASHMEM) += purgeable_ashmem_trigger.o obj-$(CONFIG_MEMORY_MONITOR) += memory_monitor.o diff --git a/mm/memory.c b/mm/memory.c index 78e05d3e9e4acabcd6fa7ce818d9c773b10fba0b..8a64230a1fec50d4368390c62b273811151d2f05 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,7 +77,9 @@ #include #include #include - +#ifdef CONFIG_MEM_PURGEABLE +#include +#endif #include #include @@ -1426,6 +1428,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, unsigned int delay_rmap; page = vm_normal_page(vma, 
addr, ptent); +#ifdef CONFIG_MEM_PURGEABLE + if (vma->vm_flags & VM_USEREXPTE) + page = NULL; +#endif if (unlikely(!should_zap_page(details, page))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -1438,7 +1444,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, ksm_might_unmap_zero_page(mm, ptent); continue; } - +#ifdef CONFIG_MEM_PURGEABLE + if (vma->vm_flags & VM_PURGEABLE) + uxpte_clear_present(vma, addr); +#endif delay_rmap = 0; if (!PageAnon(page)) { if (pte_dirty(ptent)) { @@ -3144,6 +3153,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address); +#ifdef CONFIG_MEM_PURGEABLE + if (vma->vm_flags & VM_PURGEABLE) { + pr_info("set wp new folio %lx purgeable\n", folio_pfn(new_folio)); + folio_set_purgeable(new_folio); + uxpte_set_present(vma, vmf->address); + } +#endif folio_add_lru_vma(new_folio, vma); /* * We call the notify macro here because, when using secondary @@ -4103,11 +4119,23 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; +#ifdef CONFIG_MEM_PURGEABLE + /* use extra page table for userexpte */ + if (vma->vm_flags & VM_USEREXPTE) { + if (do_uxpte_page_fault(vmf, &entry)) + goto oom; + else + goto got_page; + } +#endif /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); +#ifdef CONFIG_MEM_PURGEABLE +got_page: +#endif vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) @@ -4172,8 +4200,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) inc_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_add_new_anon_rmap(folio, vma, vmf->address); +#ifdef CONFIG_MEM_PURGEABLE + if (vma->vm_flags & VM_PURGEABLE) + folio_set_purgeable(folio); +#endif folio_add_lru_vma(folio, vma); setpte: +#ifdef CONFIG_MEM_PURGEABLE + if (vma->vm_flags & VM_PURGEABLE) + uxpte_set_present(vma, vmf->address); +#endif if (uffd_wp) entry = pte_mkuffd_wp(entry); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); diff --git a/mm/mmap.c b/mm/mmap.c index fb20221968457c9a68f219bdece01b68f0178fb1..fdd2291a987d79e9134cfed0582089036fe0a13c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -59,6 +59,11 @@ #include "internal.h" +#ifdef CONFIG_MEM_PURGEABLE +#define MAP_PURGEABLE 0x04 /* purgeable memory */ +#define MAP_USEREXPTE 0x08 /* userspace extension page table */ +#endif + #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif @@ -1353,6 +1358,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr, */ pgoff = addr >> PAGE_SHIFT; break; +#ifdef CONFIG_MEM_PURGEABLE + case MAP_PURGEABLE: + vm_flags |= VM_PURGEABLE; + break; + case MAP_USEREXPTE: + vm_flags |= VM_USEREXPTE; + break; +#endif default: return -EINVAL; } diff --git a/mm/purgeable.c b/mm/purgeable.c new file mode 100644 index 0000000000000000000000000000000000000000..54bee931cb1bed3105fd2766def504a9334cbb07 --- /dev/null +++ b/mm/purgeable.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024 Huawei Device Co., Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include /* find_lock_task_mm */ + +#include + +struct uxpte_t { + atomic64_t val; +}; + +#define UXPTE_SIZE_SHIFT 3 +#define UXPTE_SIZE (1 << UXPTE_SIZE_SHIFT) + +#define UXPTE_PER_PAGE_SHIFT (PAGE_SHIFT - UXPTE_SIZE_SHIFT) +#define UXPTE_PER_PAGE (1 << UXPTE_PER_PAGE_SHIFT) + +#define UXPTE_PRESENT_BIT 1 +#define UXPTE_PRESENT_MASK ((1 << UXPTE_PRESENT_BIT) - 1) +#define UXPTE_REFCNT_ONE (1 << UXPTE_PRESENT_BIT) +#define UXPTE_UNDER_RECLAIM (-UXPTE_REFCNT_ONE) + +#define vpn(vaddr) ((vaddr) >> PAGE_SHIFT) +#define uxpte_pn(vaddr) (vpn(vaddr) >> UXPTE_PER_PAGE_SHIFT) +#define uxpte_off(vaddr) (vpn(vaddr) & (UXPTE_PER_PAGE - 1)) +#define uxpn2addr(uxpn) ((uxpn) << (UXPTE_PER_PAGE_SHIFT + PAGE_SHIFT)) +#define uxpte_refcnt(uxpte) ((uxpte) >> UXPTE_PRESENT_BIT) +#define uxpte_present(uxpte) ((uxpte) & UXPTE_PRESENT_MASK) + +static inline long uxpte_read(struct uxpte_t *uxpte) +{ + return atomic64_read(&uxpte->val); +} + +static inline void uxpte_set(struct uxpte_t *uxpte, long val) +{ + atomic64_set(&uxpte->val, val); +} + +static inline bool uxpte_cas(struct uxpte_t *uxpte, long old, long new) +{ + return atomic64_cmpxchg(&uxpte->val, old, new) == old; +} + +void mm_init_uxpgd(struct mm_struct *mm) +{ + mm->uxpgd = NULL; + spin_lock_init(&mm->uxpgd_lock); +} + +void mm_clear_uxpgd(struct mm_struct *mm) +{ + struct page *page = NULL; + void **slot = NULL; + struct radix_tree_iter iter; + + spin_lock(&mm->uxpgd_lock); + if (!mm->uxpgd) + goto out; + radix_tree_for_each_slot(slot, mm->uxpgd, &iter, 0) { + page = radix_tree_delete(mm->uxpgd, iter.index); + put_page(page); + } +out: + kfree(mm->uxpgd); + mm->uxpgd = NULL; + spin_unlock(&mm->uxpgd_lock); +} + +/* should hold uxpgd_lock before invoke */ +static struct page *lookup_uxpte_page(struct vm_area_struct *vma, + unsigned long addr, bool alloc) +{ + struct radix_tree_root *uxpgd = NULL; + struct page *page = NULL; + struct folio *new_folio = NULL; + struct page *new_page = NULL; + struct mm_struct *mm = vma->vm_mm; + unsigned long uxpn = uxpte_pn(addr); + + if (mm->uxpgd) + goto lookup; + if (!alloc) + goto out; + spin_unlock(&mm->uxpgd_lock); + uxpgd = kzalloc(sizeof(struct radix_tree_root), GFP_KERNEL); + if (!uxpgd) { + pr_err("uxpgd alloc failed.\n"); + spin_lock(&mm->uxpgd_lock); + goto out; + } + INIT_RADIX_TREE(uxpgd, GFP_KERNEL); + spin_lock(&mm->uxpgd_lock); + if (mm->uxpgd) + kfree(uxpgd); + else + mm->uxpgd = uxpgd; +lookup: + page = radix_tree_lookup(mm->uxpgd, uxpn); + if (page) + goto out; + if (!alloc) + goto out; + spin_unlock(&mm->uxpgd_lock); + new_folio = vma_alloc_zeroed_movable_folio(vma, addr); + if (!new_folio) { + pr_err("uxpte page alloc fail.\n"); + spin_lock(&mm->uxpgd_lock); + goto out; + } + new_page = &new_folio->page; + if (radix_tree_preload(GFP_KERNEL)) { + put_page(new_page); + pr_err("radix preload fail.\n"); + spin_lock(&mm->uxpgd_lock); + goto out; + } + spin_lock(&mm->uxpgd_lock); + page = radix_tree_lookup(mm->uxpgd, uxpn); + if (page) { + put_page(new_page); + } else { + page = new_page; + radix_tree_insert(mm->uxpgd, uxpn, page); + } + radix_tree_preload_end(); +out: + return page; +} + +/* should hold uxpgd_lock before invoke */ +static struct uxpte_t *lookup_uxpte(struct vm_area_struct *vma, + unsigned long addr, bool alloc) +{ + struct uxpte_t *uxpte = NULL; + struct page *page = NULL; + + page = lookup_uxpte_page(vma, addr, alloc); + if (!page) + return NULL; + uxpte = page_to_virt(page); + + return uxpte + uxpte_off(addr); +} + 
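/*
 * For reference, and not part of the patch itself: a minimal standalone
 * sketch of the uxpte entry encoding the helpers above and below operate
 * on. Bit 0 is the present bit and the remaining bits form a signed pin
 * refcount; reclaim (lock_uxpte() below) only proceeds when that refcount
 * is zero. The main() harness and output are purely illustrative, and the
 * user-side pin/unpin protocol itself is outside this excerpt.
 */
#include <stdio.h>

#define UXPTE_PRESENT_BIT   1
#define UXPTE_PRESENT_MASK  ((1 << UXPTE_PRESENT_BIT) - 1)  /* 0x1 */
#define UXPTE_REFCNT_ONE    (1 << UXPTE_PRESENT_BIT)        /* 0x2 */
#define UXPTE_UNDER_RECLAIM (-UXPTE_REFCNT_ONE)             /* -2  */

int main(void)
{
	long val = 0;                /* empty entry */

	val += 1;                    /* uxpte_set_present(): page faulted in */
	val += UXPTE_REFCNT_ONE;     /* userspace pins the page ...          */
	val += UXPTE_REFCNT_ONE;     /* ... and pins it a second time        */

	printf("present=%ld refcnt=%ld\n",
	       val & UXPTE_PRESENT_MASK,    /* 1 */
	       val >> UXPTE_PRESENT_BIT);   /* 2 */

	/*
	 * lock_uxpte() succeeds only when val >> 1 == 0 (present but
	 * unpinned, or empty); it then installs UXPTE_UNDER_RECLAIM so
	 * concurrent pin attempts fail until unlock_uxpte() resets the
	 * entry to 0.
	 */
	printf("reclaimable=%d\n", (val >> UXPTE_PRESENT_BIT) == 0);  /* 0 */
	return 0;
}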
+bool lock_uxpte(struct vm_area_struct *vma, unsigned long addr) +{ + struct uxpte_t *uxpte = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, true); + if (!uxpte) + goto unlock; +retry: + val = uxpte_read(uxpte); + if (val >> 1) + goto unlock; + if (!uxpte_cas(uxpte, val, UXPTE_UNDER_RECLAIM)) + goto retry; + val = UXPTE_UNDER_RECLAIM; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + + return val == UXPTE_UNDER_RECLAIM; +} + +void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr) +{ + struct uxpte_t *uxpte = NULL; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, false); + if (!uxpte) + goto unlock; + uxpte_set(uxpte, 0); +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); +} + +bool uxpte_set_present(struct vm_area_struct *vma, unsigned long addr) +{ + struct uxpte_t *uxpte = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, true); + if (!uxpte) + goto unlock; +retry: + val = uxpte_read(uxpte); + if (val & 1) + goto unlock; + if (!uxpte_cas(uxpte, val, val + 1)) + goto retry; + val++; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + + return val & 1; +} + +void uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr) +{ + struct uxpte_t *uxpte = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, false); + if (!uxpte) + goto unlock; +retry: + val = uxpte_read(uxpte); + if (!(val & 1)) + goto unlock; + if (!uxpte_cas(uxpte, val, val - 1)) + goto retry; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); +} + +vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, pte_t *entry) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long vma_uxpn = vma->vm_pgoff; + unsigned long off_uxpn = vpn(vmf->address - vma->vm_start); + unsigned long addr = uxpn2addr(vma_uxpn + off_uxpn); + struct page *page = NULL; + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + spin_lock(&vma->vm_mm->uxpgd_lock); + page = lookup_uxpte_page(vma, addr, true); + spin_unlock(&vma->vm_mm->uxpgd_lock); + + if (!page) + return VM_FAULT_OOM; + + *entry = mk_pte(page, vma->vm_page_prot); + *entry = pte_sw_mkyoung(*entry); + if (vma->vm_flags & VM_WRITE) + *entry = pte_mkwrite(pte_mkdirty(*entry), vma); + return 0; +} + +static void __mm_purg_pages_info(struct mm_struct *mm, unsigned long *total_purg_pages, + unsigned long *pined_purg_pages) +{ + struct page *page = NULL; + void **slot = NULL; + struct radix_tree_iter iter; + struct uxpte_t *uxpte = NULL; + long pte_entry = 0; + int index = 0; + unsigned long nr_total = 0, nr_pined = 0; + + spin_lock(&mm->uxpgd_lock); + if (!mm->uxpgd) + goto out; + radix_tree_for_each_slot(slot, mm->uxpgd, &iter, 0) { + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + uxpte = page_to_virt(page); + for (index = 0; index < UXPTE_PER_PAGE; index++) { + pte_entry = uxpte_read(&(uxpte[index])); + if (uxpte_present(pte_entry) == 0) /* not present */ + continue; + nr_total++; + if (uxpte_refcnt(pte_entry) > 0) /* pined by user */ + nr_pined++; + } + } +out: + spin_unlock(&mm->uxpgd_lock); + + if (total_purg_pages) + *total_purg_pages = nr_total; + + if (pined_purg_pages) + *pined_purg_pages = nr_pined; +} + +void mm_purg_pages_info(struct mm_struct *mm, unsigned long *total_purg_pages, + unsigned long *pined_purg_pages) +{ + if (unlikely(!mm)) + return; + + if (!total_purg_pages && !pined_purg_pages) + return; + + __mm_purg_pages_info(mm, total_purg_pages, pined_purg_pages); +} + 
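/*
 * Context, not part of the patch: mm_purg_pages_info() above feeds the
 * per-process PurgSum/PurgPin lines added to /proc/<pid>/status by the
 * fs/proc/task_mmu.c hunk, while purg_pages_info() below feeds the
 * system-wide purgeable counters shown in /proc/meminfo. A minimal
 * userspace sketch that reads the per-process fields (only the field
 * names are taken from this patch; everything else is illustrative):
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *fp = fopen("/proc/self/status", "r");
	char line[256];

	if (!fp) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		/* Emitted by task_mem() only when CONFIG_MEM_PURGEABLE is set,
		 * e.g. "PurgSum:     128 kB" and "PurgPin:      64 kB". */
		if (!strncmp(line, "PurgSum:", 8) || !strncmp(line, "PurgPin:", 8))
			fputs(line, stdout);
	}
	fclose(fp);
	return 0;
}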
+void purg_pages_info(unsigned long *total_purg_pages, unsigned long *pined_purg_pages) +{ + struct task_struct *p = NULL; + struct task_struct *tsk = NULL; + unsigned long mm_nr_purge = 0, mm_nr_pined = 0; + unsigned long nr_total = 0, nr_pined = 0; + + if (!total_purg_pages && !pined_purg_pages) + return; + + if (total_purg_pages) + *total_purg_pages = 0; + + if (pined_purg_pages) + *pined_purg_pages = 0; + + rcu_read_lock(); + for_each_process(p) { + tsk = find_lock_task_mm(p); + if (!tsk) { + /* + * It is a kthread or all of p's threads have already + * detached their mm's. + */ + continue; + } + __mm_purg_pages_info(tsk->mm, &mm_nr_purge, &mm_nr_pined); + nr_total += mm_nr_purge; + nr_pined += mm_nr_pined; + task_unlock(tsk); + + if (mm_nr_purge > 0) { + pr_info("purgemm: tsk: %s %lu pined in %lu pages\n", tsk->comm ?: "NULL", + mm_nr_pined, mm_nr_purge); + } + } + rcu_read_unlock(); + if (total_purg_pages) + *total_purg_pages = nr_total; + + if (pined_purg_pages) + *pined_purg_pages = nr_pined; + pr_info("purgemm: Sum: %lu pined in %lu pages\n", nr_pined, nr_total); +} diff --git a/mm/purgeable_ashmem_trigger.c b/mm/purgeable_ashmem_trigger.c new file mode 100644 index 0000000000000000000000000000000000000000..73759333d645c4bbae9f6ccfa74aed4789166ca3 --- /dev/null +++ b/mm/purgeable_ashmem_trigger.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include "../drivers/staging/android/ashmem.h" + +#define PURGEABLE_ASHMEM_SHRINKALL_ARG 0 + +struct purgeable_ashmem_trigger_args { + struct seq_file *seq; + struct task_struct *tsk; +}; + +static int purgeable_ashmem_trigger_cb(const void *data, + struct file *f, unsigned int fd) +{ + const struct purgeable_ashmem_trigger_args *args = data; + struct task_struct *tsk = args->tsk; + struct purgeable_ashmem_metadata pmdata; + + if (!is_ashmem_file(f)) + return 0; + if (!get_purgeable_ashmem_metadata(f, &pmdata)) + return 0; + if (pmdata.is_purgeable) { + pmdata.name = pmdata.name == NULL ? 
"" : pmdata.name; + seq_printf(args->seq, + "%s,%u,%u,%ld,%s,%zu,%u,%u,%d,%d\n", + tsk->comm, tsk->pid, fd, (long)tsk->signal->oom_score_adj, + pmdata.name, pmdata.size, pmdata.id, pmdata.create_time, + pmdata.refc, pmdata.purged); + } + return 0; +} + +static ssize_t purgeable_ashmem_trigger_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char *buf; + unsigned int ashmem_id = 0; + unsigned int create_time = 0; + const unsigned int params_num = 2; + const struct cred *cred = current_cred(); + + if (!cred) + return 0; + + if (!uid_eq(cred->euid, GLOBAL_MEMMGR_UID) && + !uid_eq(cred->euid, GLOBAL_ROOT_UID)) { + pr_err("no permission to shrink purgeable ashmem!\n"); + return 0; + } + buf = memdup_user_nul(buffer, count); + buf = strstrip(buf); + if (sscanf(buf, "%u %u", &ashmem_id, &create_time) != params_num) + return -EINVAL; + if (ashmem_id == PURGEABLE_ASHMEM_SHRINKALL_ARG && + create_time == PURGEABLE_ASHMEM_SHRINKALL_ARG) + ashmem_shrinkall(); + else + ashmem_shrink_by_id(ashmem_id, create_time); + return count; +} + +static int purgeable_ashmem_trigger_show(struct seq_file *s, void *d) +{ + struct task_struct *tsk = NULL; + struct purgeable_ashmem_trigger_args cb_args; + const struct cred *cred = current_cred(); + + if (!cred) + return -EINVAL; + + if (!uid_eq(cred->euid, GLOBAL_MEMMGR_UID) && + !uid_eq(cred->euid, GLOBAL_ROOT_UID)) { + pr_err("no permission to shrink purgeable ashmem!\n"); + return -EINVAL; + } + seq_puts(s, "Process purgeable ashmem detail info:\n"); + seq_puts(s, "----------------------------------------------------\n"); + seq_printf(s, "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n", + "process_name", "pid", "adj", "fd", + "ashmem_name", "size", "id", "time", "ref_count", "purged"); + + ashmem_mutex_lock(); + rcu_read_lock(); + for_each_process(tsk) { + if (tsk->flags & PF_KTHREAD) + continue; + cb_args.seq = s; + cb_args.tsk = tsk; + + task_lock(tsk); + iterate_fd(tsk->files, 0, + purgeable_ashmem_trigger_cb, (void *)&cb_args); + task_unlock(tsk); + } + rcu_read_unlock(); + ashmem_mutex_unlock(); + seq_puts(s, "----------------------------------------------------\n"); + return 0; +} + +static int purgeable_ashmem_trigger_open(struct inode *inode, + struct file *file) +{ + return single_open(file, purgeable_ashmem_trigger_show, + inode->i_private); +} + +static const struct proc_ops purgeable_ashmem_trigger_fops = { + .proc_open = purgeable_ashmem_trigger_open, + .proc_write = purgeable_ashmem_trigger_write, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +void init_purgeable_ashmem_trigger(void) +{ + struct proc_dir_entry *entry = NULL; + + entry = proc_create_data("purgeable_ashmem_trigger", 0660, + NULL, &purgeable_ashmem_trigger_fops, NULL); + if (!entry) + pr_err("Failed to create purgeable ashmem trigger\n"); +} diff --git a/mm/rmap.c b/mm/rmap.c index 9f795b93cf40f5fa57c3dc38f7f18c4d4020d17d..d61242e91b121f7d7c0a2ce8e4af34f43491e2a7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -75,6 +75,7 @@ #include #include #include +#include #include @@ -811,6 +812,10 @@ static bool folio_referenced_one(struct folio *folio, while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; +#ifdef CONFIG_MEM_PURGEABLE + if (!(vma->vm_flags & VM_PURGEABLE)) + pra->vm_flags &= ~VM_PURGEABLE; +#endif if ((vma->vm_flags & VM_LOCKED) && (!folio_test_large(folio) || !pvmw.pte)) { /* Restore the mlock which got missed */ @@ -850,6 +855,9 @@ static bool folio_referenced_one(struct folio *folio, if (referenced) { 
pra->referenced++; pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; +#ifdef CONFIG_MEM_PURGEABLE + pra->vm_flags |= vma->vm_flags & ~VM_PURGEABLE; +#endif } if (!pra->mapcount) @@ -901,6 +909,9 @@ int folio_referenced(struct folio *folio, int is_locked, struct folio_referenced_arg pra = { .mapcount = folio_mapcount(folio), .memcg = memcg, +#ifdef CONFIG_MEM_PURGEABLE + .vm_flags = VM_PURGEABLE, +#endif }; struct rmap_walk_control rwc = { .rmap_one = folio_referenced_one, @@ -1522,6 +1533,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); +#ifdef CONFIG_MEM_PURGEABLE + if ((vma->vm_flags & VM_PURGEABLE) && !lock_uxpte(vma, address)) { + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } +#endif /* * If the folio is in an mlock()d vma, we must not swap it out. */ @@ -1639,7 +1657,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, set_pte_at(mm, address, pvmw.pte, pteval); } +#ifdef CONFIG_MEM_PURGEABLE + } else if ((vma->vm_flags & VM_PURGEABLE) || (pte_unused(pteval) && + !userfaultfd_armed(vma))) { +#else } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { +#endif +#ifdef CONFIG_MEM_PURGEABLE + if (vma->vm_flags & VM_PURGEABLE) + unlock_uxpte(vma, address); +#endif + /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan diff --git a/mm/vmscan.c b/mm/vmscan.c index 3f48a713f020117e6d1c7111461084a217f3d1ce..0203fd116907ef63a35797b70eb09a99d8b3b71a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1445,6 +1445,7 @@ void folio_putback_lru(struct folio *folio) enum folio_references { FOLIOREF_RECLAIM, FOLIOREF_RECLAIM_CLEAN, + FOLIOREF_RECLAIM_PURGEABLE, FOLIOREF_KEEP, FOLIOREF_ACTIVATE, }; @@ -1466,10 +1467,16 @@ static enum folio_references folio_check_references(struct folio *folio, if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE; + /* rmap lock contention: rotate */ if (referenced_ptes == -1) return FOLIOREF_KEEP; +#ifdef CONFIG_MEM_PURGEABLE + if (vm_flags & VM_PURGEABLE) + return FOLIOREF_RECLAIM_PURGEABLE; +#endif + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1796,6 +1803,7 @@ unsigned int shrink_folio_list(struct list_head *folio_list, goto keep_locked; case FOLIOREF_RECLAIM: case FOLIOREF_RECLAIM_CLEAN: + case FOLIOREF_RECLAIM_PURGEABLE: ; /* try to reclaim the folio below */ } @@ -1816,7 +1824,7 @@ unsigned int shrink_folio_list(struct list_head *folio_list, * Lazyfree folio could be freed directly */ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { - if (!folio_test_swapcache(folio)) { + if (!folio_test_swapcache(folio) && references != FOLIOREF_RECLAIM_PURGEABLE) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) @@ -1898,7 +1906,7 @@ unsigned int shrink_folio_list(struct list_head *folio_list, goto activate_locked; mapping = folio_mapping(folio); - if (folio_test_dirty(folio)) { + if (folio_test_dirty(folio) && references != FOLIOREF_RECLAIM_PURGEABLE) { /* * Only kswapd can writeback filesystem folios * to avoid risk of stack overflow. 
But avoid @@ -2013,10 +2021,11 @@ unsigned int shrink_folio_list(struct list_head *folio_list, } } - if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + if (folio_test_anon(folio) && (!folio_test_swapbacked(folio) || references == FOLIOREF_RECLAIM_PURGEABLE)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; + /* * The folio has only one reference left, which is * from the isolation. After the caller puts the @@ -7942,6 +7951,10 @@ void __meminit kswapd_stop(int nid) pgdat_kswapd_unlock(pgdat); } +#ifdef CONFIG_MEM_PURGEABLE_DEBUG +static void __init purgeable_debugfs_init(void); +#endif + static int __init kswapd_init(void) { int nid; @@ -7949,6 +7962,9 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); +#ifdef CONFIG_MEM_PURGEABLE_DEBUG + purgeable_debugfs_init(); +#endif return 0; } @@ -8174,3 +8190,75 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); + +#ifdef CONFIG_MEM_PURGEABLE_DEBUG +static unsigned long purgeable_node(pg_data_t *pgdata, struct scan_control *sc) +{ + struct mem_cgroup *memcg = NULL; + unsigned long nr = 0; +#ifdef CONFIG_MEMCG + while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))) +#endif + { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdata); + + shrink_list(LRU_ACTIVE_PURGEABLE, -1, lruvec, sc); + nr += shrink_list(LRU_INACTIVE_PURGEABLE, -1, lruvec, sc); + } + + pr_info("reclaim %lu purgeable pages.\n", nr); + + return nr; +} + +static int purgeable(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY, + .may_deactivate = DEACTIVATE_ANON, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + }; + int nid = 0; + const struct cred *cred = current_cred(); + if (!cred) + return 0; + + if (!uid_eq(cred->euid, GLOBAL_MEMMGR_UID) && + !uid_eq(cred->euid, GLOBAL_ROOT_UID)) { + pr_err("no permission to shrink purgeable heap!\n"); + return -EINVAL; + } + for_each_node_state(nid, N_MEMORY) + purgeable_node(NODE_DATA(nid), &sc); + return 0; +} + +static struct ctl_table ker_tab[] = { + { + .procname = "purgeable", + .mode = 0666, + .proc_handler = purgeable, + }, + {}, +}; + +static struct ctl_table_header *purgeable_header; + +static void __init purgeable_debugfs_init(void) +{ + purgeable_header = register_sysctl("kernel", ker_tab); + if (!purgeable_header) + pr_err("register purgeable sysctl table failed.\n"); +} + +static void __exit purgeable_debugfs_exit(void) +{ + unregister_sysctl_table(purgeable_header); +} +#endif /* CONFIG_MEM_PURGEABLE_DEBUG */ diff --git a/mm/vmstat.c b/mm/vmstat.c index dcbd443881f961b68e8d89ff9460fe80115af927..1195132d5ea1bd15cdd63040dda64c499b539248 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1172,6 +1172,10 @@ const char * const vmstat_text[] = { "nr_zone_active_anon", "nr_zone_inactive_file", "nr_zone_active_file", +#ifdef CONFIG_MEM_PURGEABLE + "nr_zone_inactive_purgeable", + "nr_zone_active_purgeable", +#endif "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", @@ -1199,6 +1203,10 @@ const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", +#ifdef CONFIG_MEM_PURGEABLE + "nr_inactive_purgeable", + "nr_active_purgeable", +#endif "nr_unevictable", "nr_slab_reclaimable", "nr_slab_unreclaimable",
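
Finally, a userspace-side sketch for context, not part of the patch: how an anonymous mapping would request purgeable semantics through the MAP_PURGEABLE mapping-type bit handled in the do_mmap() hunk above. The 0x04 value mirrors the kernel-side define added to mm/mmap.c; the uapi header that would normally export the flag is assumed here rather than shown in this patch.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_PURGEABLE
#define MAP_PURGEABLE 0x04	/* assumed to mirror the define in mm/mmap.c */
#endif

int main(void)
{
	size_t len = 64 * 4096;
	/*
	 * MAP_PURGEABLE occupies the mapping-type bits, so it is passed in
	 * place of MAP_PRIVATE/MAP_SHARED for an anonymous mapping; the
	 * resulting VMA gets VM_PURGEABLE and its pages land on the
	 * purgeable LRU lists added above.
	 */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_ANONYMOUS | MAP_PURGEABLE, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap(MAP_PURGEABLE)");
		return 1;
	}
	memset(buf, 0xab, len);	/* touch the pages: they become purgeable anon pages */
	munmap(buf, len);
	return 0;
}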