diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index e755f76f76bddc3d3f88cd6e51fd7503210af64b..53c15bb5b97713609ff8c55fe9454e6ad255cd14 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2449,7 +2449,7 @@ keepinitrd	[HW,ARM]
 	kernelcore=	[KNL,X86,IA-64,PPC]
-			Format: nn[KMGTPE] | nn% | "mirror"
+			Format: nn[KMGTPE] | nn% | "mirror" | "reliable"
 			This parameter specifies the amount of memory usable by
 			the kernel for non-movable allocations.  The requested
 			amount is spread evenly throughout all nodes in the
@@ -2473,6 +2473,10 @@
 			for Movable pages.  "nn[KMGTPE]", "nn%", and "mirror"
 			are exclusive, so you cannot specify multiple forms.
 
+			Option "reliable" is based on option "mirror" and
+			extends it. The two options are mutually exclusive.
+			Currently only arm64 is supported.
+
 	kgdbdbgp=	[KGDB,HW] kgdb over EHCI usb debug port.
 			Format: [,poll interval]
 			The controller # is the number of the ehci usb debug
@@ -5514,6 +5518,15 @@
 			[KNL, SMP] Set scheduler's default relax_domain_level.
 			See Documentation/admin-guide/cgroup-v1/cpusets.rst.
 
+	reliable_debug=	[ARM64]
+			Format: [P][,S][,F]
+			Only works when CONFIG_MEMORY_RELIABLE is enabled and
+			"kernelcore=reliable" is configured.
+			P: Page cache does not use reliable memory.
+			S: Shmem does not use reliable memory.
+			F: User memory allocations (special user tasks, tmpfs)
+			do not fall back to the non-mirrored region on failure.
+
 	reserve=	[KNL,BUGS] Force kernel to ignore I/O ports or memory
 			Format: ,[,,,...]
 			Reserve I/O ports or memory so the kernel won't use
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 65e0556064d681830286870534651433034fc8fd..56d9ba24bd1472c1af32959ff9f76e888584dc21 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -163,6 +163,8 @@ usually fail with ESRCH.
 		can be derived from smaps, but is faster and more convenient
  numa_maps	An extension based on maps, showing the memory locality and
 		binding policy as well as mem usage (in pages) of each mapping.
+ reliable	Present with CONFIG_MEMORY_RELIABLE=y. Task reliable status
+		information.
 ============= ===============================================================
 
 For example, to get the status information of a process, all you have to do is
@@ -195,6 +197,7 @@ read the file /proc/PID/status::
   VmPTE:        20 kb
   VmSwap:        0 kB
   HugetlbPages:  0 kB
+  Reliable:   1608 kB
   CoreDumping:   0
   THP_enabled:	 1
   Threads:        1
@@ -278,6 +281,7 @@ It's slow but very precise.
  VmSwap                      amount of swap used by anonymous private data
                              (shmem swap usage is not included)
  HugetlbPages                size of hugetlb memory portions
+ Reliable                    size of reliable memory used
  CoreDumping                 process's memory is currently being dumped
                              (killing the process may lead to a corrupted core)
  THP_enabled                 process is allowed to use THP (returns 0 when
@@ -674,6 +678,10 @@ Where:
 node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page
 size, in KB, that is backing the mapping up.
 
+The /proc/pid/reliable file is used to control a user task's reliable status.
+A task with this flag set can only allocate memory from the mirrored region.
+The global init task's reliable flag cannot be accessed.
+
 1.2 Kernel data
 ---------------
 
@@ -1021,6 +1029,13 @@ Example output. You may not have all of these fields.
   DirectMap4k:      401152 kB
   DirectMap2M:    10008576 kB
   DirectMap1G:    24117248 kB
+  ReliableTotal:    8190696 kB
+  ReliableUsed:      252912 kB
+  ReliableTaskUsed:  108136 kB
+  ReliableBuddyMem: 7937784 kB
+  ReliableShmem:        840 kB
+  FileCache:         104944 kB
+  ReliableFileCache: 102688 kB
 
 MemTotal
               Total usable RAM (i.e. physical RAM minus a few reserved
@@ -1185,6 +1200,21 @@ HugePages_Total, HugePages_Free, HugePages_Rsvd, HugePages_Surp, Hugepagesize, H
 DirectMap4k, DirectMap2M, DirectMap1G
               Breakdown of page table sizes used in the kernel's
               identity mapping of RAM
+ReliableTotal
+              Total reliable memory size
+ReliableUsed
+              The used amount of reliable memory
+ReliableTaskUsed
+              Size of mirrored memory used by user tasks
+ReliableBuddyMem
+              Size of unused mirrored memory in the buddy system
+ReliableShmem
+              Total reliable memory used by shared memory
+FileCache
+              Memory usage of the page cache
+ReliableFileCache
+              Reliable memory usage of the page cache
+
 
 vmallocinfo
 ~~~~~~~~~~~
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index cbc8c379a4a5e2659cbbe941330bb9a7514d7daa..2f9621c36d839184d959f17e6954bb91e2f66936 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1141,6 +1141,7 @@ CONFIG_LRU_GEN=y
 CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y
 CONFIG_PER_VMA_LOCK=y
 CONFIG_LOCK_MM_AND_FIND_VMA=y
+CONFIG_MEMORY_RELIABLE=y
 
 #
 # Data Access Monitoring
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bd08616ed8bad7937173183eb08634c9526a4e90..70dca85a5861d9d379955b199e33788924dd1751 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -34,3 +34,4 @@ proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PRINTK)	+= kmsg.o
 proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
 proc-$(CONFIG_BOOT_CONFIG)	+= bootconfig.o
+proc-$(CONFIG_MEMORY_RELIABLE)	+= mem_reliable.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 243c15919e1839c0ed20641832c87ad9345fee56..e04b0126334f991775223b2b1149f9447ac712f2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2657,6 +2657,14 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
 	return d_splice_alias(inode, dentry);
 }
 
+static bool proc_hide_pidents(const struct pid_entry *p)
+{
+	if (mem_reliable_hide_file(p->name))
+		return true;
+
+	return false;
+}
+
 static struct dentry *proc_pident_lookup(struct inode *dir,
 					 struct dentry *dentry,
 					 const struct pid_entry *p,
@@ -2675,6 +2683,8 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 	for (; p < end; p++) {
 		if (p->len != dentry->d_name.len)
 			continue;
+		if (proc_hide_pidents(p))
+			continue;
 		if (!memcmp(dentry->d_name.name, p->name, p->len)) {
 			res = proc_pident_instantiate(dentry, task, p);
 			break;
@@ -2701,8 +2711,9 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
 		goto out;
 
 	for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
-		if (!proc_fill_cache(file, ctx, p->name, p->len,
-				proc_pident_instantiate, task, p))
+		if (!proc_hide_pidents(p) &&
+		    !proc_fill_cache(file, ctx, p->name, p->len,
+				     proc_pident_instantiate, task, p))
 			break;
 		ctx->pos++;
 	}
@@ -3382,6 +3393,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 	ONE("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+#ifdef CONFIG_MEMORY_RELIABLE
+	REG("reliable", S_IRUGO|S_IWUSR, proc_reliable_operations),
+#endif
 #ifdef CONFIG_AUDIT
 	REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid", S_IRUGO,
proc_sessionid_operations), @@ -3731,6 +3745,9 @@ static const struct pid_entry tid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_MEMORY_RELIABLE + REG("reliable", S_IRUGO|S_IWUSR, proc_reliable_operations), +#endif #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), diff --git a/fs/proc/mem_reliable.c b/fs/proc/mem_reliable.c new file mode 100644 index 0000000000000000000000000000000000000000..635efc6fd3602040f7892c01423467eafe40801a --- /dev/null +++ b/fs/proc/mem_reliable.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include "internal.h" + +static inline int reliable_check(struct task_struct *task, struct pid *pid) +{ + if (!mem_reliable_is_enabled()) + return -EACCES; + + if (is_global_init(task)) + return -EINVAL; + + if (!task->mm || (task->flags & PF_KTHREAD) || + (task->flags & PF_EXITING)) + return -EINVAL; + + return 0; +} + +static ssize_t reliable_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct pid *pid = proc_pid(file_inode(file)); + char buffer[PROC_NUMBUF]; + size_t len; + short val; + int err; + + if (!task) + return -ESRCH; + + err = reliable_check(task, pid); + if (err) { + put_task_struct(task); + return err; + } + + val = task->flags & PF_RELIABLE ? 1 : 0; + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%hd\n", val); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t reliable_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct pid *pid = proc_pid(file_inode(file)); + char buffer[PROC_NUMBUF]; + int val; + int err; + + if (!task) + return -ESRCH; + + err = reliable_check(task, pid); + if (err) + goto out; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &val); + if (err) + goto out; + if (val != 0 && val != 1) { + err = -EINVAL; + goto out; + } + + if (val == 1) + task->flags |= PF_RELIABLE; + else + task->flags &= ~PF_RELIABLE; + +out: + put_task_struct(task); + return err < 0 ? 
err : count; +} + +struct file_operations proc_reliable_operations = { + .read = reliable_read, + .write = reliable_write, + .llseek = generic_file_llseek, +}; + +bool mem_reliable_hide_file(const char *name) +{ + if (!mem_reliable_is_enabled() && + !strncmp("reliable", name, strlen("reliable"))) + return true; + + return false; +} diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 45af9a989d4040135a4fe18acc9b5ef055a2affc..8f8cd1c60e0422e3d83a46d799860a0f2045e319 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,6 +168,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) arch_report_meminfo(m); + reliable_report_meminfo(m); + return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fe12b057d077fb0b08c94d4191192ced622734f9..fac21bcba8a68c980daf7b66b590ea694c2e38db 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -77,6 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); + reliable_report_usage(m, mm); } #undef SEQ_PUT_DEC diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5b917e5b93509a1947fc4c626b5945aebdfdf2f4..83a75c7344c35cb918e91cf56a9d8fec32cff489 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -134,6 +134,12 @@ static inline enum zone_type gfp_zone(gfp_t flags) z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + +#ifdef CONFIG_MEMORY_RELIABLE + if (z == ZONE_MOVABLE && (flags & GFP_RELIABLE)) + return ZONE_NORMAL; +#endif + return z; } diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6583a58670c571050ad410e0dcf0718b6477292b..d88913d62431cae30796e6dbc6559a01692bef05 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,11 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -/* 0x200u unused */ +#ifdef CONFIG_MEMORY_RELIABLE +#define ___GFP_RELIABLE 0x200u +#else +#define ___GFP_RELIABLE 0 +#endif #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -248,6 +252,9 @@ typedef unsigned int __bitwise gfp_t; /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) +/* Alloc memory from mirrored region */ +#define __GFP_RELIABLE ((__force gfp_t)___GFP_RELIABLE) + /* Room for N __GFP_FOO bits */ #define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -336,5 +343,6 @@ typedef unsigned int __bitwise gfp_t; #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) +#define GFP_RELIABLE __GFP_RELIABLE #endif /* __LINUX_GFP_TYPES_H */ diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h new file mode 100644 index 0000000000000000000000000000000000000000..15f69349a2a863fa25b59406f2ddb935c31bd8b4 --- /dev/null +++ b/include/linux/mem_reliable.h @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_MEM_RELIABLE__ +#define __MM_MEM_RELIABLE__ + +#ifdef CONFIG_MEMORY_RELIABLE + +#include +#include +#include +#include +#include +#include +#include + +DECLARE_STATIC_KEY_FALSE(mem_reliable); + +extern bool reliable_enabled; +extern struct file_operations proc_reliable_operations; +extern bool shmem_reliable; 
+extern bool reliable_allow_fallback; +extern bool pagecache_reliable; +extern struct percpu_counter pagecache_reliable_pages; +extern struct percpu_counter anon_reliable_pages; +extern struct percpu_counter shmem_reliable_pages; +extern unsigned long task_reliable_limit __read_mostly; +extern unsigned long shmem_reliable_limit __read_mostly; +extern unsigned long pagecache_reliable_limit __read_mostly; + +void mem_reliable_init(bool has_unmirrored_mem, unsigned long mirrored_sz); +bool mem_reliable_status(void); +bool mem_reliable_hide_file(const char *name); +void shmem_reliable_init(void); +void reliable_lru_add(enum lru_list lru, struct folio *folio, int val); +void reliable_lru_add_batch(int zid, enum lru_list lru, int val); +bool mem_reliable_counter_initialized(void); +void reliable_report_meminfo(struct seq_file *m); +void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, + int preferred_nid, nodemask_t *nodemask); +void reliable_report_usage(struct seq_file *m, struct mm_struct *mm); + +static inline bool mem_reliable_is_enabled(void) +{ + return static_branch_likely(&mem_reliable); +} + +static inline bool page_reliable(struct page *page) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!page) + return false; + + return page_zonenum(page) < ZONE_MOVABLE; +} + +static inline bool folio_reliable(struct folio *folio) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!folio) + return false; + + return folio_zonenum(folio) < ZONE_MOVABLE; +} + +static inline bool shmem_reliable_is_enabled(void) +{ + return shmem_reliable; +} + +static inline bool filemap_reliable_is_enabled(void) +{ + return pagecache_reliable; +} + +static inline bool skip_non_mirrored_zone(gfp_t gfp, struct zoneref *z) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!current->mm || (current->flags & PF_KTHREAD)) + return false; + + /* user tasks can only alloc memory from non-mirrored region */ + if (!(gfp & GFP_RELIABLE) && (gfp & __GFP_HIGHMEM) && + (gfp & __GFP_MOVABLE)) { + if (zonelist_zone_idx(z) < ZONE_MOVABLE) + return true; + } + + return false; +} + +static inline bool reliable_allow_fb_enabled(void) +{ + return reliable_allow_fallback; +} + +static inline bool mem_reliable_shmem_limit_check(void) +{ + return percpu_counter_read_positive(&shmem_reliable_pages) < + (shmem_reliable_limit >> PAGE_SHIFT); +} + +/* + * Check if this memory allocation for shmem is allowed. + * Return false if limit is triggered. 
+ */ +static inline bool shmem_prepare_alloc(gfp_t *gfp_mask) +{ + if (!mem_reliable_is_enabled()) + return true; + + if (!shmem_reliable_is_enabled()) { + *gfp_mask &= ~GFP_RELIABLE; + return true; + } + + if (mem_reliable_shmem_limit_check()) { + *gfp_mask |= GFP_RELIABLE; + return true; + } + + if (reliable_allow_fb_enabled()) + return true; + + return false; +} + +static inline void filemap_prepare_alloc(gfp_t *gfp_mask) +{ + s64 nr_reliable = 0; + + if (!mem_reliable_is_enabled()) + return; + + if (!filemap_reliable_is_enabled()) { + *gfp_mask &= ~GFP_RELIABLE; + return; + } + + nr_reliable = percpu_counter_read_positive(&pagecache_reliable_pages); + if (nr_reliable > pagecache_reliable_limit >> PAGE_SHIFT) { + *gfp_mask &= ~GFP_RELIABLE; + return; + } + + *gfp_mask |= GFP_RELIABLE; +} + +static inline unsigned long task_reliable_used_pages(void) +{ + s64 nr_pages; + + nr_pages = percpu_counter_read_positive(&pagecache_reliable_pages); + nr_pages += percpu_counter_read_positive(&anon_reliable_pages); + + return nr_pages; +} + +static inline void shmem_reliable_folio_add(struct folio *folio, int nr_page) +{ + if (shmem_reliable_is_enabled() && folio_reliable(folio)) + percpu_counter_add(&shmem_reliable_pages, nr_page); +} + + +static inline bool reliable_mem_limit_check(unsigned long nr_page) +{ + return (task_reliable_used_pages() + nr_page) <= + (task_reliable_limit >> PAGE_SHIFT); +} + +static inline bool mem_reliable_should_reclaim(void) +{ + if (percpu_counter_sum_positive(&pagecache_reliable_pages) >= + MAX_ORDER_NR_PAGES) + return true; + + return false; +} + +static inline void reliable_page_counter_inner(struct mm_struct *mm, int val) +{ + atomic_long_add(val, &mm->reliable_nr_page); + + /* + * Update reliable page counter to zero if underflows. + * + * Since reliable page counter is used for debug purpose only, + * there is no real function problem by doing this. 
+ */ + if (unlikely(atomic_long_read(&mm->reliable_nr_page) < 0)) + atomic_long_set(&mm->reliable_nr_page, 0); +} + +static inline void add_reliable_folio_counter(struct folio *folio, + struct mm_struct *mm, int val) +{ + if (!folio_reliable(folio)) + return; + + reliable_page_counter_inner(mm, val); +} + +static inline void add_reliable_page_counter(struct page *page, + struct mm_struct *mm, int val) +{ + if (!page_reliable(page)) + return; + + reliable_page_counter_inner(mm, val); +} +#else +#define reliable_enabled 0 + +static inline bool mem_reliable_is_enabled(void) { return false; } +static inline bool filemap_reliable_is_enabled(void) { return false; } +static inline void mem_reliable_init(bool has_unmirrored_mem, + unsigned long mirrored_sz) {} +static inline bool page_reliable(struct page *page) { return false; } +static inline bool folio_reliable(struct folio *folio) { return false; } +static inline bool skip_non_mirrored_zone(gfp_t gfp, struct zoneref *z) +{ + return false; +} +static inline bool mem_reliable_status(void) { return false; } +static inline bool mem_reliable_hide_file(const char *name) { return false; } +static inline bool shmem_prepare_alloc(gfp_t *gfp_mask) { return true; } +static inline void filemap_prepare_alloc(gfp_t *gfp_mask) {} +static inline void shmem_reliable_init(void) {} +static inline void reliable_lru_add(enum lru_list lru, struct folio *folio, + int val) {} +static inline void reliable_lru_add_batch(int zid, enum lru_list lru, + int val) {} +static inline bool mem_reliable_counter_initialized(void) { return false; } +static inline void shmem_reliable_folio_add(struct folio *folio, + int nr_page) {} +static inline void reliable_report_meminfo(struct seq_file *m) {} +static inline bool mem_reliable_shmem_limit_check(void) { return true; } +static inline bool reliable_mem_limit_check(unsigned long nr_page) +{ + return false; +} +static inline bool mem_reliable_should_reclaim(void) { return false; } +static inline void mem_reliable_out_of_memory(gfp_t gfp_mask, + unsigned int order, + int preferred_nid, + nodemask_t *nodemask) {} +static inline bool reliable_allow_fb_enabled(void) { return false; } +static inline void add_reliable_page_counter(struct page *page, + struct mm_struct *mm, int val) {} +static inline void add_reliable_folio_counter(struct folio *folio, + struct mm_struct *mm, int val) {} +static inline void reliable_report_usage(struct seq_file *m, + struct mm_struct *mm) {} +#endif + +#endif diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ae3bde302f704ad1c464efc4e3f6cf09f164c5d3..653c307f22c46c6b34a7b23e1fcc9ff418a9b50f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -42,6 +42,7 @@ extern unsigned long long max_possible_pfn; * kernel resource tree. * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are * not initialized (only for reserved regions). 
+ * @MEMBLOCK_NOMIRROR: memory region for non-mirrored memory */ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ @@ -50,6 +51,7 @@ enum memblock_flags { MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */ + MEMBLOCK_NOMIRROR = 0x100, /* alloc from non-mirrored region */ }; /** @@ -428,6 +430,10 @@ void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); +void *memblock_alloc_try_nid_raw_flags(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid, + enum memblock_flags flags); static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f078aa6b493cf57ae1e7a8237163b8aa59ac3bfd..c00def598f9522ffb052fe3208a65847b306a20c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4093,4 +4093,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end) #endif +/* added to mm.h to avoid every caller adding new header file */ +#include + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8148b30a9df10874967fc461948fbc08b1c2f629..57acfa854841a3d294463c5890722983deadf69a 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -8,6 +8,7 @@ #include #include #include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -195,6 +196,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; + reliable_lru_add(lru, folio, delta); __update_lru_size(lruvec, lru, zone, delta); return; } @@ -203,6 +205,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; + reliable_lru_add(lru, folio, -delta); __update_lru_size(lruvec, lru, zone, -delta); return; } @@ -317,6 +320,7 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) if (lru_gen_add_folio(lruvec, folio, false)) return; + reliable_lru_add(lru, folio, folio_nr_pages(folio)); update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) @@ -331,6 +335,7 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) if (lru_gen_add_folio(lruvec, folio, true)) return; + reliable_lru_add(lru, folio, folio_nr_pages(folio)); update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ @@ -347,6 +352,7 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); + reliable_lru_add(lru, folio, -folio_nr_pages(folio)); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0bc3c7c191a5cc66989ab0eb2b0d6b0d85e6dcc6..a077f60819d9e00cc715e406c933725b8b4f87d3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -936,6 +936,10 @@ struct mm_struct { #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_SHARE_POOL struct sp_group_master *sp_group_master; +#endif +#ifdef CONFIG_MEMORY_RELIABLE + /* total used reliable pages */ + atomic_long_t 
reliable_nr_page;
 #endif
 } __randomize_layout;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 57086c57b8e47494f8213d0f5ad9f098bb094971..d055148f47add3df21eedf0291892b70b66290a1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -286,6 +286,11 @@ static inline bool is_file_lru(enum lru_list lru)
 	return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
 }
 
+static inline int is_anon_lru(enum lru_list lru)
+{
+	return (lru == LRU_INACTIVE_ANON || lru == LRU_ACTIVE_ANON);
+}
+
 static inline bool is_active_lru(enum lru_list lru)
 {
 	return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9fdd08aa96263bd2f78d9bc88d6ab834504f7c6b..97ad76e86ed6e3e22d266ef206c364492715c5cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1756,6 +1756,11 @@ extern struct pid *cad_pid;
 #define PF_USER_WORKER		0x00004000	/* Kernel thread cloned from userspace thread */
 #define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
 #define PF__HOLE__00010000	0x00010000
+#ifdef CONFIG_MEMORY_RELIABLE
+#define PF_RELIABLE		PF__HOLE__00010000	/* Allocate from reliable memory */
+#else
+#define PF_RELIABLE		0x00000000
+#endif
 #define PF_KSWAPD		0x00020000	/* I am kswapd */
 #define PF_MEMALLOC_NOFS	0x00040000	/* All allocation requests will inherit GFP_NOFS */
 #define PF_MEMALLOC_NOIO	0x00080000	/* All allocation requests will inherit GFP_NOIO */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 18d30581137a4dffcd0b6f6b05ab45fb3ac8e4b9..6e24b2fbc4458f4ad012e4916aa30d7120ed212a 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -50,7 +50,8 @@
 	gfpflag_string(__GFP_RECLAIM),		\
 	gfpflag_string(__GFP_DIRECT_RECLAIM),	\
 	gfpflag_string(__GFP_KSWAPD_RECLAIM),	\
-	gfpflag_string(__GFP_ZEROTAGS)
+	gfpflag_string(__GFP_ZEROTAGS),		\
+	gfpflag_string(__GFP_RELIABLE)
 
 #ifdef CONFIG_KASAN_HW_TAGS
 #define __def_gfpflag_names_kasan , \
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 3048589e2e8516e12a817875988bdc5986c6ad09..7899662639965e34e914dddeb57a7be77f7d8265 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -181,6 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	if (new_page) {
 		folio_get(new_folio);
+		add_reliable_folio_counter(new_folio, mm, folio_nr_pages(new_folio));
 		page_add_new_anon_rmap(new_page, vma, addr);
 		folio_add_lru_vma(new_folio, vma);
 	} else
@@ -198,6 +199,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	set_pte_at_notify(mm, addr, pvmw.pte,
 			  mk_pte(new_page, vma->vm_page_prot));
+	add_reliable_page_counter(old_page, mm, -1);
 
 	page_remove_rmap(old_page, vma, false);
 	if (!folio_mapped(old_folio))
 		folio_free_swap(old_folio);
diff --git a/mm/Kconfig b/mm/Kconfig
index c277bb069ab74b3d40dc233c34ca0138d653ca77..2df11b146c8402b69c6671f96df0f41c390c817d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1348,6 +1348,23 @@ config CLEAR_FREELIST_PAGE
 	  To enable this feature, kernel parameter "clear_freelist" also
 	  needs to be added.
 
+config MEMORY_RELIABLE
+	bool "Support for memory reliable"
+	depends on ARM64
+	default n
+	help
+	  Memory reliable is based on mirrored memory. It adds the following
+	  features:
+	  a) normal user tasks never allocate memory from the mirrored region;
+	  b) special user tasks allocate memory from the mirrored region by
+	  default; c) an upper limit on the mirrored region allocated for
+	  user tasks, shmem and page cache.
+	  Special user tasks, shmem and page cache can fall back to the
+	  non-mirrored region if the reliable fallback mechanism is enabled.
+
+	  To enable this feature, mirrored memory is required and
+	  "kernelcore=reliable" needs to be added to the kernel parameters.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 6759053ed782e98c1fe6c10ba8fd480e9d97527d..e1a853e318565a01cc22f817f4db5cda58419b23 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -144,3 +144,4 @@ obj-$(CONFIG_SHARE_POOL) += share_pool.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
 obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
+obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
diff --git a/mm/filemap.c b/mm/filemap.c
index 485dfc8b0f9c67682d1cff9939de15983348924d..571a5c5cd372a521e4e4ab9d0b72df703b3dba68 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -185,6 +185,7 @@ static void filemap_unaccount_folio(struct address_space *mapping,
 	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
 	if (folio_test_swapbacked(folio)) {
 		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+		shmem_reliable_folio_add(folio, -nr);
 		if (folio_test_pmd_mappable(folio))
 			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
 	} else if (folio_test_pmd_mappable(folio)) {
@@ -830,10 +831,14 @@ void replace_page_cache_folio(struct folio *old, struct folio *new)
 		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
 	if (!folio_test_hugetlb(new))
 		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
-	if (folio_test_swapbacked(old))
+	if (folio_test_swapbacked(old)) {
 		__lruvec_stat_sub_folio(old, NR_SHMEM);
-	if (folio_test_swapbacked(new))
+		shmem_reliable_folio_add(old, -folio_nr_pages(old));
+	}
+	if (folio_test_swapbacked(new)) {
 		__lruvec_stat_add_folio(new, NR_SHMEM);
+		shmem_reliable_folio_add(new, folio_nr_pages(new));
+	}
 	xas_unlock_irq(&xas);
 	if (free_folio)
 		free_folio(old);
@@ -963,6 +968,8 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
 	int n;
 	struct folio *folio;
 
+	filemap_prepare_alloc(&gfp);
+
 	if (cpuset_do_page_mem_spread()) {
 		unsigned int cpuset_mems_cookie;
 		do {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a13418df11157aa9587bbfd89dcdb76c667dda17..65421d751a9d9456fafc52ec347ac2d130616833 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -704,6 +704,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+		add_reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR);
 		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
@@ -1143,6 +1144,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		return -EAGAIN;
 	}
 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	add_reliable_page_counter(src_page, dst_mm, HPAGE_PMD_NR);
 out_zero_page:
 	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
@@ -1687,6 +1689,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (pmd_present(orig_pmd)) {
 		page = pmd_page(orig_pmd);
+		add_reliable_page_counter(page, tlb->mm, -HPAGE_PMD_NR);
 		page_remove_rmap(page, vma, true);
 		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
 		VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -2103,6 +2106,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			set_page_dirty(page);
 		if (!PageReferenced(page) && pmd_young(old_pmd))
 			SetPageReferenced(page);
+
add_reliable_page_counter(page, mm, -HPAGE_PMD_NR); page_remove_rmap(page, vma, true); put_page(page); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e05ab2cb7913b3e4dc70e9b95d7a72611e2eafb6..825d1a18d27db0d8631f84b380d50353ac408357 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,6 +3246,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } +static void *__init __alloc_bootmem_huge_page_inner(phys_addr_t size, + phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, + int nid) +{ + if (!mem_reliable_is_enabled()) + return memblock_alloc_try_nid_raw(size, align, min_addr, + max_addr, nid); + + return memblock_alloc_try_nid_raw_flags(size, align, min_addr, max_addr, + nid, MEMBLOCK_NOMIRROR); +} + int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) @@ -3260,7 +3274,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + m = __alloc_bootmem_huge_page_inner(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; @@ -3268,7 +3282,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { - m = memblock_alloc_try_nid_raw( + m = __alloc_bootmem_huge_page_inner( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 128d854dad8e914f798d1c6832801809bb9fd7e0..6f2787d3b682072aa74cd712d4f31240c76485b3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -95,6 +95,8 @@ static struct kmem_cache *mm_slot_cache __read_mostly; struct collapse_control { bool is_khugepaged; + /* alloc hugepage from reliable zone */ + bool reliable; /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; @@ -715,6 +717,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); + add_reliable_page_counter(src_page, vma->vm_mm, 1); page_remove_rmap(src_page, vma, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); @@ -825,6 +828,7 @@ static void khugepaged_alloc_sleep(void) struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, + .reliable = false, }; static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) @@ -1063,6 +1067,9 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, int node = hpage_collapse_find_target_node(cc); struct folio *folio; + if (cc->reliable) + gfp |= GFP_RELIABLE; + if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask)) return SCAN_ALLOC_HUGE_PAGE_FAIL; @@ -1218,6 +1225,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); + add_reliable_page_counter(hpage, vma->vm_mm, HPAGE_PMD_NR); page_add_new_anon_rmap(hpage, vma, address); lru_cache_add_inactive_or_unevictable(hpage, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); @@ -1260,6 +1268,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); + cc->reliable = false; pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte) { result = SCAN_PMD_NULL; @@ -1384,6 +1393,9 @@ static int 
hpage_collapse_scan_pmd(struct mm_struct *mm, PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; + + if (page_reliable(page)) + cc->reliable = true; } if (!writable) { result = SCAN_PAGE_RO; @@ -1621,6 +1633,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, */ ptep_clear(mm, addr, pte); page_remove_rmap(page, vma, false); + add_reliable_page_counter(page, mm, -1); nr_ptes++; } @@ -2227,6 +2240,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, swap = 0; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); + cc->reliable = false; rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) @@ -2294,6 +2308,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, xas_pause(&xas); cond_resched_rcu(); } + + if (page_reliable(page)) + cc->reliable = true; } rcu_read_unlock(); @@ -2739,6 +2756,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); + cc->reliable = false; if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); diff --git a/mm/ksm.c b/mm/ksm.c index 981af9c72e7a3ea754f374007a828de97d22ff93..7401a6c87a4bee9b1fa39a4b61f62c0e3838a175 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1233,6 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); + add_reliable_page_counter(kpage, mm, 1); page_add_anon_rmap(kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { @@ -1262,6 +1263,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, */ ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); + add_reliable_page_counter(page, mm, -1); folio = page_folio(page); page_remove_rmap(page, vma, false); diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c new file mode 100644 index 0000000000000000000000000000000000000000..c28b4ebe71c9db2c345cede5a0ca7c018b9a6c93 --- /dev/null +++ b/mm/mem_reliable.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "mem reliable: " fmt + +#include +#include +#include +#include +#include +#include + +#define PAGES_TO_B(n_pages) ((n_pages) << PAGE_SHIFT) + +enum mem_reliable_types { + MEM_RELIABLE_ALL, + MEM_RELIABLE_FALLBACK, + MEM_RELIABLE_SHMEM, + MEM_RELIABLE_PAGECACHE, + MEM_RELIABLE_MAX +}; + +DEFINE_STATIC_KEY_FALSE(mem_reliable); +EXPORT_SYMBOL_GPL(mem_reliable); + +bool reliable_enabled; +bool shmem_reliable __read_mostly = true; +bool pagecache_reliable __read_mostly = true; +struct percpu_counter pagecache_reliable_pages; +struct percpu_counter anon_reliable_pages; +struct percpu_counter shmem_reliable_pages; +unsigned long pagecache_reliable_limit = ULONG_MAX; +/* reliable user limit for user tasks with reliable flag */ +unsigned long task_reliable_limit = ULONG_MAX; +unsigned long shmem_reliable_limit = ULONG_MAX; +bool reliable_allow_fallback __read_mostly = true; + +bool mem_reliable_counter_initialized(void) +{ + return likely(percpu_counter_initialized(&pagecache_reliable_pages)) && + likely((percpu_counter_initialized(&anon_reliable_pages))); +} + +bool mem_reliable_status(void) +{ + return mem_reliable_is_enabled(); +} +EXPORT_SYMBOL_GPL(mem_reliable_status); + +static unsigned long 
total_reliable_pages(void) +{ + unsigned long total_reliable_pages = 0; + struct zone *z; + + for_each_populated_zone(z) + if (zone_idx(z) < ZONE_MOVABLE) + total_reliable_pages += zone_managed_pages(z); + + return total_reliable_pages; +} + +static unsigned long free_reliable_pages(void) +{ + struct zone *zone; + unsigned long cnt = 0; + + for_each_populated_zone(zone) + if (zone_idx(zone) < ZONE_MOVABLE) + cnt += zone_page_state(zone, NR_FREE_PAGES); + + return cnt; +} + +static unsigned long used_reliable_pages(void) +{ + return total_reliable_pages() - free_reliable_pages(); +} + +void mem_reliable_init(bool has_unmirrored_mem, unsigned long mirrored_sz) +{ + if (!reliable_enabled) + return; + + if (!has_unmirrored_mem) { + pr_err("init failed, unmirrored memory size is zero.\n"); + return; + } + + static_branch_enable(&mem_reliable); + + pr_info("init succeed, mirrored memory size(%lu)\n", mirrored_sz); +} + +void shmem_reliable_init(void) +{ + if (!mem_reliable_is_enabled() || !shmem_reliable_is_enabled()) { + shmem_reliable = false; + return; + } + + percpu_counter_init(&shmem_reliable_pages, 0, GFP_KERNEL); +} + +void reliable_lru_add_batch(int zid, enum lru_list lru, int val) +{ + if (!mem_reliable_is_enabled()) + return; + + if (zid < ZONE_MOVABLE) { + if (is_file_lru(lru)) + percpu_counter_add(&pagecache_reliable_pages, val); + else if (is_anon_lru(lru)) + percpu_counter_add(&anon_reliable_pages, val); + } +} + +void reliable_lru_add(enum lru_list lru, struct folio *folio, int val) +{ + if (!folio_reliable(folio)) + return; + + if (is_file_lru(lru)) + percpu_counter_add(&pagecache_reliable_pages, val); + else if (is_anon_lru(lru)) + percpu_counter_add(&anon_reliable_pages, val); + else if (lru == LRU_UNEVICTABLE) { + if (folio_test_anon(folio)) + percpu_counter_add(&anon_reliable_pages, val); + else + percpu_counter_add(&pagecache_reliable_pages, val); + } +} + +#ifdef CONFIG_SYSCTL +static int reliable_pagecache_max_bytes_write(struct ctl_table *table, + int write, void __user *buffer, + size_t *length, loff_t *ppos) +{ + unsigned long old_value = pagecache_reliable_limit; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (!ret && write) { + if (pagecache_reliable_limit > + PAGES_TO_B(total_reliable_pages())) { + pagecache_reliable_limit = old_value; + return -EINVAL; + } + } + + return ret; +} + +static int reliable_limit_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + unsigned long old = task_reliable_limit; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (!ret && write) { + if (task_reliable_limit > PAGES_TO_B(total_reliable_pages()) || + task_reliable_limit < + (task_reliable_used_pages() << PAGE_SHIFT)) { + task_reliable_limit = old; + return -EINVAL; + } + } + + return ret; +} + +#ifdef CONFIG_SHMEM +static int reliable_shmem_bytes_limit_handler(struct ctl_table *table, + int write, void __user *buffer, + size_t *length, loff_t *ppos) +{ + unsigned long *data_ptr = (unsigned long *)(table->data); + unsigned long old = *data_ptr; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (!ret && write) { + if (*data_ptr > PAGES_TO_B(total_reliable_pages())) { + *data_ptr = old; + return -EINVAL; + } + } + + return ret; +} +#endif + +static void mem_reliable_feature_disable(int idx); + +#define CTRL_BITS_SHIFT MEM_RELIABLE_MAX +#define CTRL_BITS_MASK ((1 << CTRL_BITS_SHIFT) - 1) + +static unsigned long 
mem_reliable_ctrl_bits = CTRL_BITS_MASK; + +static void mem_reliable_ctrl_bit_disable(int idx) +{ + clear_bit(idx, &mem_reliable_ctrl_bits); +} + +static bool mem_reliable_ctrl_bit_is_enabled(int idx) +{ + return !!test_bit(idx, &mem_reliable_ctrl_bits); +} + +static void mem_reliable_handle_ctrl_bits(unsigned long ctrl_bits) +{ + bool status; + int i; + + for (i = MEM_RELIABLE_FALLBACK; i < MEM_RELIABLE_MAX; i++) { + status = !!test_bit(i, &ctrl_bits); + + if (mem_reliable_ctrl_bit_is_enabled(i) && !status) + mem_reliable_feature_disable(i); + } +} + +static void mem_reliable_disable_all(void) +{ + mem_reliable_ctrl_bits = 0; + + reliable_allow_fallback = false; + shmem_reliable = false; + pagecache_reliable = false; + static_branch_disable(&mem_reliable); + + pr_info("memory reliable feature disabled.\n"); +} + +static int reliable_debug_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + unsigned long old_ctrl_bits, new_ctrl_bits; + static DEFINE_MUTEX(reliable_debug_mutex); + int ret; + + mutex_lock(&reliable_debug_mutex); + old_ctrl_bits = mem_reliable_ctrl_bits; + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret == 0 && write) { + if (!mem_reliable_is_enabled() || + (mem_reliable_ctrl_bits > (1 << CTRL_BITS_SHIFT) - 1)) { + mem_reliable_ctrl_bits = old_ctrl_bits; + mutex_unlock(&reliable_debug_mutex); + + return -EINVAL; + } + + new_ctrl_bits = mem_reliable_ctrl_bits; + mem_reliable_ctrl_bits = old_ctrl_bits; + if (!!test_bit(MEM_RELIABLE_ALL, &new_ctrl_bits)) + mem_reliable_handle_ctrl_bits(new_ctrl_bits); + else + mem_reliable_disable_all(); + } + + mutex_unlock(&reliable_debug_mutex); + + return ret; +} + +static struct ctl_table reliable_ctl_table[] = { + { + .procname = "reliable_pagecache_max_bytes", + .data = &pagecache_reliable_limit, + .maxlen = sizeof(pagecache_reliable_limit), + .mode = 0644, + .proc_handler = reliable_pagecache_max_bytes_write, + }, + { + .procname = "task_reliable_limit", + .data = &task_reliable_limit, + .maxlen = sizeof(task_reliable_limit), + .mode = 0644, + .proc_handler = reliable_limit_handler, + }, +#ifdef CONFIG_SHMEM + { + .procname = "shmem_reliable_bytes_limit", + .data = &shmem_reliable_limit, + .maxlen = sizeof(shmem_reliable_limit), + .mode = 0644, + .proc_handler = reliable_shmem_bytes_limit_handler, + }, +#endif + { + .procname = "reliable_debug", + .data = &mem_reliable_ctrl_bits, + .maxlen = sizeof(mem_reliable_ctrl_bits), + .mode = 0600, + .proc_handler = reliable_debug_handler, + }, + {} +}; + +static int __init reliable_sysctl_init(void) +{ + if (!mem_reliable_is_enabled()) + return 0; + + if (!register_sysctl("vm", reliable_ctl_table)) { + pr_err("register sysctl failed."); + return -ENOMEM; + } + + percpu_counter_init(&pagecache_reliable_pages, 0, GFP_KERNEL); + percpu_counter_init(&anon_reliable_pages, 0, GFP_KERNEL); + + return 0; +} +arch_initcall(reliable_sysctl_init); +#else +static void mem_reliable_ctrl_bit_disabled(int idx) {} +#endif + +static void mem_reliable_feature_disable(int idx) +{ + char *str = NULL; + + switch (idx) { + case MEM_RELIABLE_FALLBACK: + reliable_allow_fallback = false; + str = "fallback"; + break; + case MEM_RELIABLE_SHMEM: + shmem_reliable = false; + str = "shmem"; + break; + case MEM_RELIABLE_PAGECACHE: + pagecache_reliable = false; + str = "pagecache"; + break; + default: + pr_err("unknown index: %d", idx); + return; + } + + mem_reliable_ctrl_bit_disable(idx); + pr_info("%s is disabled\n", str); +} + +#define 
TO_KB(bytes) ((bytes) >> 10) +#define PAGES_TO_KB(n_pages) ((n_pages) << (PAGE_SHIFT - 10)) + +#define SEQ_printf(m, x...) \ +do { \ + if (m) \ + seq_printf(m, x); \ + else \ + pr_info(x); \ +} while (0) + +static void reliable_show_limits(void) +{ + SEQ_printf(NULL, "ReliableTaskLimit: %lu kB\n", + TO_KB(task_reliable_limit)); + + if (shmem_reliable_is_enabled()) + SEQ_printf(NULL, "ReliableShmemLimit: %lu kB\n", + TO_KB(shmem_reliable_limit)); + + if (filemap_reliable_is_enabled()) + SEQ_printf(NULL, "ReliableFileCacheLimit: %lu kB\n", + TO_KB(pagecache_reliable_limit)); +} + +void reliable_report_meminfo(struct seq_file *m) +{ + if (!mem_reliable_is_enabled()) + return; + + SEQ_printf(m, "ReliableTotal: %8lu kB\n", + PAGES_TO_KB(total_reliable_pages())); + SEQ_printf(m, "ReliableUsed: %8lu kB\n", + PAGES_TO_KB(used_reliable_pages())); + SEQ_printf(m, "ReliableTaskUsed: %8lu kB\n", + PAGES_TO_KB(task_reliable_used_pages())); + SEQ_printf(m, "ReliableBuddyMem: %8lu kB\n", + PAGES_TO_KB(free_reliable_pages())); + + if (shmem_reliable_is_enabled()) { + unsigned long shmem_pages = (unsigned long)percpu_counter_sum( + &shmem_reliable_pages); + SEQ_printf(m, "ReliableShmem: %8lu kB\n", + PAGES_TO_KB(shmem_pages)); + } + + if (filemap_reliable_is_enabled()) { + unsigned long nr_reliable_pages = 0; + unsigned long num = 0; + + num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + SEQ_printf(m, "FileCache: %8lu kB\n", PAGES_TO_KB(num)); + + nr_reliable_pages = + percpu_counter_sum_positive(&pagecache_reliable_pages); + SEQ_printf(m, "ReliableFileCache: %8lu kB\n", + PAGES_TO_KB(nr_reliable_pages)); + } + + /* show limit info during oom */ + if (!m) + reliable_show_limits(); +} + +void mem_reliable_out_of_memory(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct oom_control oc = { + .zonelist = node_zonelist(preferred_nid, gfp), + .nodemask = nodemask, + .memcg = NULL, + .gfp_mask = gfp, + .order = order, + }; + + if (!mutex_trylock(&oom_lock)) + return; + out_of_memory(&oc); + mutex_unlock(&oom_lock); +} + +static int __init setup_reliable_debug(char *str) +{ + if (*str++ != '=' || !*str) + /* + * No options specified. + */ + goto out; + + /* + * Determine which debug features should be switched on + */ + for (; *str && *str != ','; str++) { + switch (*str) { + case 'P': + mem_reliable_feature_disable(MEM_RELIABLE_PAGECACHE); + break; + case 'S': + mem_reliable_feature_disable(MEM_RELIABLE_SHMEM); + break; + case 'F': + mem_reliable_feature_disable(MEM_RELIABLE_FALLBACK); + break; + default: + pr_err("reliable_debug option '%c' unknown. 
skipped\n", + *str); + } + } + +out: + return 1; +} +__setup("reliable_debug", setup_reliable_debug); + +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) +void reliable_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + if (!mem_reliable_is_enabled()) + return; + + SEQ_PUT_DEC("Reliable:\t", atomic_long_read(&mm->reliable_nr_page)); + seq_puts(m, " kB\n"); +} diff --git a/mm/memblock.c b/mm/memblock.c index fd492e5bbdbcde42497a513fded38048bce91bde..e18a25f6ce040462473215c0d516ce947807f14e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1038,6 +1038,10 @@ static bool should_skip_region(struct memblock_type *type, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) return true; + /* skip mirror memory regions with MEMBLOCK_NOMIRROR */ + if ((flags & MEMBLOCK_NOMIRROR) && memblock_is_mirror(m)) + return true; + /* skip nomap memory unless we were asked for it explicitly */ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) return true; @@ -1381,13 +1385,14 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** - * memblock_alloc_range_nid - allocate boot memory block + * memblock_alloc_range_nid_flags - allocate boot memory block with specify flag * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @start: the lower bound of the memory region to allocate (phys address) * @end: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @exact_nid: control the allocation fall back to other nodes + * @flags: alloc memory from specify memblock flag * * The allocation is performed from memory region limited by * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. @@ -1395,22 +1400,18 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * If the specified node can not hold the requested memory and @exact_nid * is false, the allocation falls back to any node in the system. * - * For systems with memory mirroring, the allocation is attempted first - * from the regions with mirroring enabled and then retried from any - * memory region. - * - * In addition, function using kmemleak_alloc_phys for allocated boot - * memory block, it is never reported as leaks. + * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for + * allocated boot memory block, so that it is never reported as leaks. * * Return: * Physical address of allocated memory block on success, %0 on failure. */ -phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, +phys_addr_t __init memblock_alloc_range_nid_flags(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid, - bool exact_nid) + bool exact_nid, + enum memblock_flags flags) { - enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n")) @@ -1471,6 +1472,41 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, return found; } +/** + * memblock_alloc_range_nid - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (phys address) + * @end: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. + * + * If the specified node can not hold the requested memory and @exact_nid + * is false, the allocation falls back to any node in the system. + * + * For systems with memory mirroring, the allocation is attempted first + * from the regions with mirroring enabled and then retried from any + * memory region. + * + * In addition, function using kmemleak_alloc_phys for allocated boot + * memory block, it is never reported as leaks. + * + * Return: + * Physical address of allocated memory block on success, %0 on failure. + */ +phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid, + bool exact_nid) +{ + return memblock_alloc_range_nid_flags(size, align, start, end, nid, + exact_nid, + choose_memblock_flags()); +} + /** * memblock_phys_alloc_range - allocate a memory block inside specified range * @size: size of memory block to be allocated in bytes @@ -1522,6 +1558,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * @max_addr: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @exact_nid: control the allocation fall back to other nodes + * @flags: alloc memory from specify memblock flag * * Allocates memory block using memblock_alloc_range_nid() and * converts the returned physical address to virtual. @@ -1534,10 +1571,11 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * Return: * Virtual address of allocated memory block on success, NULL on failure. 
*/ -static void * __init memblock_alloc_internal( +static void * __init __memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, - int nid, bool exact_nid) + int nid, bool exact_nid, + enum memblock_flags flags) { phys_addr_t alloc; @@ -1552,13 +1590,13 @@ static void * __init memblock_alloc_internal( if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; - alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid, - exact_nid); + alloc = memblock_alloc_range_nid_flags(size, align, min_addr, max_addr, nid, + exact_nid, flags); /* retry allocation without lower limit */ if (!alloc && min_addr) - alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid, - exact_nid); + alloc = memblock_alloc_range_nid_flags(size, align, 0, max_addr, nid, + exact_nid, flags); if (!alloc) return NULL; @@ -1566,6 +1604,15 @@ static void * __init memblock_alloc_internal( return phys_to_virt(alloc); } +static void * __init memblock_alloc_internal( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid, bool exact_nid) +{ + return __memblock_alloc_internal(size, align, min_addr, max_addr, nid, + exact_nid, choose_memblock_flags()); +} + /** * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node * without zeroing memory @@ -1629,6 +1676,19 @@ void * __init memblock_alloc_try_nid_raw( false); } +void * __init memblock_alloc_try_nid_raw_flags( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid, enum memblock_flags flags) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + + return __memblock_alloc_internal(size, align, min_addr, max_addr, nid, + false, flags); +} + /** * memblock_alloc_try_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes diff --git a/mm/memory.c b/mm/memory.c index 6569c9e97c9d9a77bf46ad955be501ad4997969a..944c2ce2756b11b98873a255a801ff5758c1373a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -963,6 +963,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, folio_get(folio); page_dup_file_rmap(page, false); rss[mm_counter_file(page)]++; + add_reliable_folio_counter(folio, dst_vma->vm_mm, 1); } /* @@ -1463,6 +1464,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); } rss[mm_counter(page)]--; + add_reliable_page_counter(page, mm, -1); if (!delay_rmap) { page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) @@ -1490,6 +1492,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, */ WARN_ON_ONCE(!vma_is_anonymous(vma)); rss[mm_counter(page)]--; + add_reliable_page_counter(page, mm, -1); if (is_device_private_entry(entry)) page_remove_rmap(page, vma, false); put_page(page); @@ -3166,10 +3169,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) dec_mm_counter(mm, mm_counter_file(&old_folio->page)); inc_mm_counter(mm, MM_ANONPAGES); } + add_reliable_folio_counter(old_folio, mm, -1); } else { ksm_might_unmap_zero_page(mm, vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } + + add_reliable_folio_counter(new_folio, mm, 1); flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(&new_folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); @@ -4023,6 +4029,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_free_swap(folio); 
inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + add_reliable_folio_counter(folio, vma->vm_mm, 1); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); @@ -4198,6 +4205,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + add_reliable_folio_counter(folio, vma->vm_mm, 1); folio_add_new_anon_rmap(folio, vma, vmf->address); folio_add_lru_vma(folio, vma); setpte: @@ -4340,6 +4348,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); + add_reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); page_add_file_rmap(page, vma, true); /* @@ -4396,6 +4405,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (unlikely(uffd_wp)) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ + add_reliable_folio_counter(folio, vma->vm_mm, nr); if (write && !(vma->vm_flags & VM_SHARED)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr); VM_BUG_ON_FOLIO(nr != 1, folio); diff --git a/mm/migrate.c b/mm/migrate.c index 5aab4994c4b592e2f40178684f1d3ee256647b55..322c63e6f9be0df3d4a3b43d0300e7e3d423ba0c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -465,6 +465,11 @@ int folio_migrate_mapping(struct address_space *mapping, xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ + if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { + shmem_reliable_folio_add(folio, -nr); + shmem_reliable_folio_add(newfolio, nr); + } + /* * If moved to a different zone then also account * the page for that zone. Other VM counters will be diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8ac1f79f754a290445a9a40c0e71f7e5b5853491..2e5bce2f1cb9eb249c174f02c974c6a53b5f0ddb 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -652,6 +652,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); + add_reliable_page_counter(page, mm, 1); page_add_new_anon_rmap(page, vma, addr); if (!is_zone_device_page(page)) lru_cache_add_inactive_or_unevictable(page, vma); diff --git a/mm/mm_init.c b/mm/mm_init.c index fed4370b02e1ed51171c19e5179b6b8d90f9b6d4..0a3c20a003187665758beece2b57fcf5d0ee779a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "internal.h" #include "slab.h" #include "shuffle.h" @@ -268,9 +269,28 @@ static int __init cmdline_parse_kernelcore(char *p) { /* parse kernelcore=mirror */ if (parse_option_str(p, "mirror")) { + if (reliable_enabled) { + pr_warn("kernelcore=reliable and kernelcore=mirror are alternative.\n"); + return -EINVAL; + } + + mirrored_kernelcore = true; + return 0; + } + +#ifdef CONFIG_MEMORY_RELIABLE + /* parse kernelcore=reliable */ + if (parse_option_str(p, "reliable")) { + if (!reliable_enabled && mirrored_kernelcore) { + pr_warn("kernelcore=mirror and kernelcore=reliable are alternative.\n"); + return -EINVAL; + } + + reliable_enabled = true; mirrored_kernelcore = true; return 0; } +#endif return cmdline_parse_core(p, &required_kernelcore, &required_kernelcore_percent); @@ -375,15 +395,24 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; + bool has_unmirrored_mem = false; + unsigned long mirrored_sz = 0; if (!memblock_has_mirror()) { pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n"); goto out; } + if 
+			pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");
+			goto out;
+		}
+
 		for_each_mem_region(r) {
-			if (memblock_is_mirror(r))
+			if (memblock_is_mirror(r)) {
+				mirrored_sz += r->size;
 				continue;
+			}
 
 			nid = memblock_get_region_node(r);
@@ -394,6 +423,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 				continue;
 			}
 
+			has_unmirrored_mem = true;
 			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
 				min(usable_startpfn, zone_movable_pfn[nid]) :
 				usable_startpfn;
@@ -402,6 +432,8 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 		if (mem_below_4gb_not_mirrored)
 			pr_warn("This configuration results in unmirrored kernel memory.\n");
 
+		mem_reliable_init(has_unmirrored_mem, mirrored_sz);
+
 		goto out2;
 	}
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f5b61c1060d19b8b2c046e0ac619644b0f435a17..8118695b959b3e1c56e91ba21be6f4ee0d671fd0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3189,6 +3189,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 		struct page *page;
 		unsigned long mark;
 
+		/* skip non-mirrored zone for normal user tasks */
+		if (skip_non_mirrored_zone(gfp_mask, z))
+			continue;
+
 		if (cpusets_enabled() &&
 			(alloc_flags & ALLOC_CPUSET) &&
 			!__cpuset_zone_allowed(zone, gfp_mask))
@@ -4037,6 +4041,52 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
 	return false;
 }
 
+#ifdef CONFIG_MEMORY_RELIABLE
+/*
+ * If fallback is enabled, fall back to the movable zone when no DMA/normal
+ * zone is found.
+ */
+static inline struct zone *mem_reliable_fallback_zone(gfp_t gfp_mask,
+						      struct alloc_context *ac)
+{
+	if (!reliable_allow_fb_enabled())
+		return NULL;
+
+	if (!(gfp_mask & GFP_RELIABLE))
+		return NULL;
+
+	ac->highest_zoneidx = gfp_zone(gfp_mask & ~GFP_RELIABLE);
+	ac->preferred_zoneref = first_zones_zonelist(
+		ac->zonelist, ac->highest_zoneidx, ac->nodemask);
+	return ac->preferred_zoneref->zone;
+}
+
+static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask,
+						  struct alloc_context *ac)
+{
+	if (!reliable_allow_fb_enabled())
+		return;
+
+	if (gfp_mask & __GFP_NOFAIL)
+		return;
+
+	if ((ac->highest_zoneidx == ZONE_NORMAL) && (gfp_mask & GFP_RELIABLE)) {
+		ac->highest_zoneidx = gfp_zone(gfp_mask & ~GFP_RELIABLE);
+		ac->preferred_zoneref = first_zones_zonelist(
+			ac->zonelist, ac->highest_zoneidx, ac->nodemask);
+		return;
+	}
+}
+#else
+static inline struct zone *mem_reliable_fallback_zone(gfp_t gfp_mask,
+						      struct alloc_context *ac)
+{
+	return NULL;
+}
+static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask,
+						  struct alloc_context *ac) {}
+#endif
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
@@ -4076,8 +4126,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 */
 	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
 					ac->highest_zoneidx, ac->nodemask);
-	if (!ac->preferred_zoneref->zone)
-		goto nopage;
+	if (!ac->preferred_zoneref->zone) {
+		if (!mem_reliable_fallback_zone(gfp_mask, ac))
+			goto nopage;
+	}
 
 	/*
 	 * Check for insane configurations where the cpuset doesn't contain
@@ -4095,6 +4147,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (alloc_flags & ALLOC_KSWAPD)
 		wake_all_kswapds(order, gfp_mask, ac);
 
+	mem_reliable_fallback_slowpath(gfp_mask, ac);
+
 	/*
 	 * The adjusted alloc_flags might result in immediate success, so try
 	 * that first
@@ -4526,6 +4580,116 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 }
 EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
 
+static inline void prepare_before_alloc(gfp_t *gfp_mask)
+{
+	bool zone_movable;
+
+	if (!mem_reliable_is_enabled())
+		return;
+
+	/*
+	 * Memory reliable only handles allocations that target the movable
+	 * zone (forcing them either into the non-movable zone or into the
+	 * movable zone) to get total isolation.
+	 */
+	zone_movable = gfp_zone(*gfp_mask & ~GFP_RELIABLE) == ZONE_MOVABLE;
+	if (!zone_movable)
+		goto clear_flag;
+
+	if (!in_task())
+		return;
+
+	if ((current->flags & PF_RELIABLE) || is_global_init(current))
+		*gfp_mask |= GFP_RELIABLE;
+
+	return;
+clear_flag:
+	*gfp_mask &= ~GFP_RELIABLE;
+}
+
+static inline long mem_reliable_direct_reclaim(int nr_pages, struct alloc_context *ac)
+{
+	long nr_reclaimed = 0;
+
+	while (nr_reclaimed < nr_pages) {
+		/* try to free cache from reliable region */
+		long progress = __perform_reclaim(GFP_KERNEL, 0, ac);
+
+		nr_reclaimed += progress;
+		if (progress < SWAP_CLUSTER_MAX)
+			break;
+	}
+
+	return nr_reclaimed;
+}
+
+/*
+ * Returning true means the memory allocation needs to be retried and the
+ * GFP_RELIABLE flag must be cleared.
+ */
+static inline bool check_after_alloc(gfp_t *gfp, unsigned int order,
+				     int preferred_nid,
+				     struct alloc_context *ac,
+				     struct page **_page)
+{
+	int retry_times = MAX_RECLAIM_RETRIES;
+	int nr_pages;
+
+	if (!mem_reliable_is_enabled())
+		return false;
+
+	if (!(*gfp & GFP_RELIABLE))
+		return false;
+
+	if (!*_page)
+		goto out_retry;
+
+	if (*gfp & __GFP_NOFAIL || current->flags & PF_MEMALLOC)
+		goto out;
+
+	/* percpu counter is not initialized, ignore limit check */
+	if (!mem_reliable_counter_initialized())
+		goto out;
+
+limit_check:
+	/* user task is limited by task_reliable_limit */
+	if (!reliable_mem_limit_check(1 << order))
+		goto out_free_page;
+
+	goto out;
+
+out_free_page:
+	if (mem_reliable_should_reclaim() && retry_times--) {
+		nr_pages = mem_reliable_direct_reclaim(1 << order, ac);
+		if (nr_pages)
+			goto limit_check;
+	}
+
+	__free_pages(*_page, order);
+	*_page = NULL;
+
+out_retry:
+	if (reliable_allow_fb_enabled() || is_global_init(current)) {
+		*gfp &= ~GFP_RELIABLE;
+		return true;
+	}
+
+	if (*gfp & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE))
+		goto out;
+
+	/* Coredumps can quickly deplete all memory reserves */
+	if (current->flags & PF_DUMPCORE)
+		goto out;
+	/* The OOM killer will not help higher order allocs */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		goto out;
+
+	/* trigger OOM for reliable memory */
+	mem_reliable_out_of_memory(*gfp, order, preferred_nid, ac->nodemask);
+out:
+	return false;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -4545,6 +4709,10 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 		return NULL;
 
 	gfp &= gfp_allowed_mask;
+
+	prepare_before_alloc(&gfp);
+
+retry:
 	/*
 	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
 	 * resp. GFP_NOIO which has to be inherited for all allocation requests
@@ -4587,6 +4755,9 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 		page = NULL;
 	}
 
+	if (check_after_alloc(&gfp, order, preferred_nid, &ac, &page))
+		goto retry;
+
 	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
 	kmsan_alloc_page(page, order, alloc_gfp);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 9f795b93cf40f5fa57c3dc38f7f18c4d4020d17d..93ea81fe51800e494861d1c7cc41ed521faf98d1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1636,6 +1636,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 						hsz);
 			} else {
 				dec_mm_counter(mm, mm_counter(&folio->page));
+				add_reliable_page_counter(&folio->page, mm, -1);
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
@@ -1651,6 +1652,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * copied pages.
 			 */
 			dec_mm_counter(mm, mm_counter(&folio->page));
+			add_reliable_page_counter(&folio->page, mm, -1);
 		} else if (folio_test_anon(folio)) {
 			swp_entry_t entry = page_swap_entry(subpage);
 			pte_t swp_pte;
@@ -1693,6 +1695,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				if (ref_count == 1 + map_count &&
 				    !folio_test_dirty(folio)) {
 					dec_mm_counter(mm, MM_ANONPAGES);
+					add_reliable_folio_counter(folio, mm, -1);
 					goto discard;
 				}
 
@@ -1737,6 +1740,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					spin_unlock(&mmlist_lock);
 				}
 				dec_mm_counter(mm, MM_ANONPAGES);
+				add_reliable_folio_counter(folio, mm, -1);
 				inc_mm_counter(mm, MM_SWAPENTS);
 				swp_pte = swp_entry_to_pte(entry);
 				if (anon_exclusive)
@@ -1759,6 +1763,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * See Documentation/mm/mmu_notifier.rst
 			 */
 			dec_mm_counter(mm, mm_counter_file(&folio->page));
+			add_reliable_folio_counter(folio, mm, -1);
 		}
 discard:
 		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
@@ -2033,6 +2038,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 						hsz);
 			} else {
 				dec_mm_counter(mm, mm_counter(&folio->page));
+				add_reliable_page_counter(&folio->page, mm, -1);
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
@@ -2048,6 +2054,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 * copied pages.
 			 */
 			dec_mm_counter(mm, mm_counter(&folio->page));
+			add_reliable_page_counter(&folio->page, mm, -1);
 		} else {
 			swp_entry_t entry;
 			pte_t swp_pte;
diff --git a/mm/shmem.c b/mm/shmem.c
index 69595d3418829f08d05829f3ad5166b7421a30fc..b44bfad90f8de64d8939652846fa532d382db45f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -808,6 +808,7 @@ static int shmem_add_to_page_cache(struct folio *folio,
 		mapping->nrpages += nr;
 		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
 		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
+		shmem_reliable_folio_add(folio, nr);
 unlock:
 		xas_unlock_irq(&xas);
 	} while (xas_nomem(&xas, gfp));
@@ -839,6 +840,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
 	mapping->nrpages -= nr;
 	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
 	__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+	shmem_reliable_folio_add(folio, -nr);
 	xa_unlock_irq(&mapping->i_pages);
 	folio_put(folio);
 	BUG_ON(error);
@@ -1677,6 +1679,9 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
 	if (err)
 		goto failed;
 
+	if (!shmem_prepare_alloc(&gfp))
+		goto no_mem;
+
 	if (huge)
 		folio = shmem_alloc_hugefolio(gfp, info, index);
 	else
@@ -1687,6 +1692,7 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
 		return folio;
 	}
 
+no_mem:
 	err = -ENOMEM;
 	shmem_inode_unacct_blocks(inode, nr);
 failed:
@@ -1754,8 +1760,10 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
 		mem_cgroup_migrate(old, new);
 		__lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
 		__lruvec_stat_mod_folio(new, NR_SHMEM, 1);
+		shmem_reliable_folio_add(new, 1);
 		__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
 		__lruvec_stat_mod_folio(old, NR_SHMEM, -1);
+		shmem_reliable_folio_add(old, -1);
 	}
 	xa_unlock_irq(&swap_mapping->i_pages);
 
@@ -4625,6 +4633,9 @@ void __init shmem_init(void)
 	else
 		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
 #endif
+
+	shmem_reliable_init();
+
 	return;
 
 out1:
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 4b888b18bddea98a1952e54e8118f104a7461c4a..5604925fb0b42515183efe11e33a09857aaa7b18 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -426,4 +426,5 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 #ifdef CONFIG_MEMORY_FAILURE
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
+	reliable_report_meminfo(NULL);
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 68859289f19e38795c9df1cb5dfd8fed7036c583..f103c83cc064fe0ba831b7d7e479b5414a27f338 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1882,6 +1882,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 
 	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
 	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+	add_reliable_page_counter(page, vma->vm_mm, 1);
 	get_page(page);
 	if (page == swapcache) {
 		rmap_t rmap_flags = RMAP_NONE;
diff --git a/mm/userswap.c b/mm/userswap.c
index 18c99c2a0fc752b83d1dd5fb49111bfc079eb322..e76e9d7a40de030ad07461835789dba3780a09b3 100644
--- a/mm/userswap.c
+++ b/mm/userswap.c
@@ -162,6 +162,7 @@ static int uswap_unmap_anon_page(struct mm_struct *mm,
 					  SWP_USERSWAP_ENTRY, page_to_pfn(page))));
 
 	dec_mm_counter(mm, MM_ANONPAGES);
+	add_reliable_page_counter(page, mm, -1);
 	page_remove_rmap(page, vma, false);
 	page->mapping = NULL;
 
@@ -192,6 +193,7 @@ static unsigned long vm_insert_anon_page(struct vm_area_struct *vma,
 	}
 
 	inc_mm_counter(mm, MM_ANONPAGES);
+	add_reliable_page_counter(page, mm, 1);
 	page_add_new_anon_rmap(page, vma, addr);
 	dst_pte = mk_pte(page, vma->vm_page_prot);
 	if (vma->vm_flags & VM_WRITE)
@@ -217,6 +219,7 @@ static void uswap_map_anon_page(struct mm_struct *mm,
 	flush_cache_page(vma, addr, pte_pfn(*pte));
 	set_pte_at(mm, addr, pte, old_pte);
 	inc_mm_counter(mm, MM_ANONPAGES);
+	add_reliable_page_counter(page, mm, 1);
 	page_add_new_anon_rmap(page, vma, addr);
 	pte_unmap_unlock(pte, ptl);
 }
@@ -531,6 +534,7 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd,
 	}
 
 	inc_mm_counter(mm, MM_ANONPAGES);
+	add_reliable_page_counter(page, mm, 1);
 	page_add_new_anon_rmap(page, dst_vma, dst_addr);
 	set_pte_at(mm, dst_addr, pte, dst_pte);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6461552c81d78c65451b5c10f0b76d3a795e6301..da31aa73c02bbd7716d280ece3b76cd560a1a0d1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2260,6 +2260,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
 			continue;
 
 		update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
+		reliable_lru_add_batch(zid, lru, -nr_zone_taken[zid]);
 	}
 
 }
@@ -3867,6 +3868,7 @@ static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk
 		if (lru_gen_is_active(lruvec, gen))
 			lru += LRU_ACTIVE;
 		__update_lru_size(lruvec, lru, zone, delta);
+		reliable_lru_add_batch(zone, lru, delta);
 	}
 }
 
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 9714327fd0eadd1b8dd58d83443668d0d2acd052..db9ca1d84b745305ada91a3d4bb93cd53c3212b9 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -682,6 +682,7 @@ static const struct {
 	{ "__GFP_RECLAIM",		"R" },
 	{ "__GFP_DIRECT_RECLAIM",	"DR" },
 	{ "__GFP_KSWAPD_RECLAIM",	"KR" },
+	{ "__GFP_RELIABLE",		"REL" },
 };
 
 static size_t max_gfp_len;