diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 2bfb380e8380a8e4d3a3efd3e13a8f7c07c51cfb..fdff6c4247db1db70fd73f623d1749a76d9154ae 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -160,6 +160,14 @@ library) may want to know the size (in bytes) of a transparent hugepage::
 
 	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
 
+If CONFIG_THP_NUMA_CONTROL is enabled, users can control THP migration
+during NUMA balancing. The default value 0 keeps the default behavior;
+writing 1 disables THP migration while tasks themselves still have the
+chance to migrate::
+
+	echo 0 > /sys/kernel/mm/transparent_hugepage/numa_control
+	echo 1 > /sys/kernel/mm/transparent_hugepage/numa_control
+
 khugepaged will be automatically started when
 transparent_hugepage/enabled is set to "always" or "madvise, and it'll
 be automatically shutdown if it's set to "never".
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index cae54a9bf65df9dcd099bfac63777e2b4b6eadc1..bde9ec4af773cb4c9be8211cb840ff4bec66af6d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -216,6 +216,7 @@ config ARM64
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
 	select HAVE_LIVEPATCH_WO_FTRACE
+	select THP_NUMA_CONTROL if ARM64_64K_PAGES
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index ee59f78da952528b8701f7d771f97be32940068c..5a199f9c551d1125721defab5a7f67e38b5beb7d 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1182,6 +1182,7 @@ CONFIG_MEMORY_RELIABLE=y
 CONFIG_EXTEND_HUGEPAGE_MAPPING=y
 CONFIG_MEM_SAMPLING=y
 CONFIG_NUMABALANCING_MEM_SAMPLING=y
+# CONFIG_THP_NUMA_CONTROL is not set
 
 #
 # Data Access Monitoring
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index c24f1fa471a7d788dfd3113b21be1388eb506e99..9cf7ad8c599f0819e1d1c8d9f2fb1e6b6b7af115 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -1110,6 +1110,7 @@ CONFIG_ARCH_HAS_PTE_SPECIAL=y
 CONFIG_MAPPING_DIRTY_HELPERS=y
 CONFIG_MEMORY_RELIABLE=y
 # CONFIG_CLEAR_FREELIST_PAGE is not set
+# CONFIG_THP_NUMA_CONTROL is not set
 
 #
 # Data Access Monitoring
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index efb370e79ac3a7a215d5722e93bbbc35ce08d7eb..d9dde313d26766f99fa82aa191c6b023137ee1f0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -498,6 +498,19 @@ static inline unsigned long thp_size(struct page *page)
 	return PAGE_SIZE << thp_order(page);
 }
 
+#ifdef CONFIG_THP_NUMA_CONTROL
+#define THP_DISABLE_NUMA_MIGRATE 1
+extern unsigned long thp_numa_control;
+static inline bool thp_numa_migrate_disabled(void)
+{
+	return thp_numa_control == THP_DISABLE_NUMA_MIGRATE;
+}
+#else
+static inline bool thp_numa_migrate_disabled(void)
+{
+	return false;
+}
+#endif
 /*
  * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
  * limitations in the implementation like arm64 MTE can override this to
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
index 5c168bc60862ddaa30b10f9321f1508974f72d27..6978c11d549950785d748e50e625dcd9f33c99d8 100644
--- a/include/linux/mem_sampling.h
+++ b/include/linux/mem_sampling.h
@@ -105,4 +105,17 @@ static inline int arm_spe_enabled(void)
 	return 0;
 }
 #endif /* CONFIG_ARM_SPE_MEM_SAMPLING */
+
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+static inline bool numa_affinity_sampling_enabled(void)
+{
+	return static_branch_unlikely(&sched_numabalancing_mem_sampling);
+}
+#else
+static inline bool numa_affinity_sampling_enabled(void)
+{
+	return false;
+}
+#endif
+
 #endif /* __MEM_SAMPLING_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7f56800b17da3ac441351ed67dd5b3452e00a360..0e47766bc5910890410bc4f70c21619680a07863 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1423,6 +1423,20 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
 	return 1000 * faults / total_faults;
 }
 
+static inline bool in_early_stage(struct task_struct *p, int early_seq)
+{
+	/*
+	 * For sampling-based autonuma, numa_scan_seq is never updated, so
+	 * skip the early-stage check here to avoid false migrations. In the
+	 * future, a real lifetime judgment can be implemented if workloads
+	 * are very sensitive to the starting stage of the process.
+	 */
+	if (numa_affinity_sampling_enabled())
+		return false;
+
+	return p->numa_scan_seq <= early_seq;
+}
+
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 				int src_nid, int dst_cpu)
 {
@@ -1439,7 +1453,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	 * two full passes of the "multi-stage node selection" test that is
 	 * executed below.
 	 */
-	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
+	if ((p->numa_preferred_nid == NUMA_NO_NODE || in_early_stage(p, 4)) &&
 	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
 		return true;
 
@@ -2391,6 +2405,8 @@ static void task_numa_placement(struct task_struct *p)
 	spinlock_t *group_lock = NULL;
 	struct numa_group *ng;
 
+	if (numa_affinity_sampling_enabled())
+		goto not_scan;
 	/*
 	 * The p->mm->numa_scan_seq field gets updated without
 	 * exclusive access. Use READ_ONCE() here to ensure
@@ -2402,6 +2418,7 @@ static void task_numa_placement(struct task_struct *p)
 	p->numa_scan_seq = seq;
 	p->numa_scan_period_max = task_scan_max(p);
 
+not_scan:
 	total_faults = p->numa_faults_locality[0] +
 		       p->numa_faults_locality[1];
 	runtime = numa_get_avg_runtime(p, &period);
@@ -2968,16 +2985,13 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	struct callback_head *work = &curr->numa_work;
 	u64 period, now;
 
-#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
 	/*
-	 * If we are using access hints from hardware (like using
-	 * SPE), don't scan the address space.
-	 * Note that currently PMD-level page migration is not
-	 * supported.
+	 * NUMA affinity uses hardware sampling (e.g. SPE on ARM64) to collect
+	 * NUMA info, so there is no need to scan the address space anymore.
 	 */
-	if (static_branch_unlikely(&sched_numabalancing_mem_sampling))
+	if (numa_affinity_sampling_enabled())
 		return;
-#endif
+
 	/*
 	 * We don't care about NUMA placement if we don't have memory.
 	 */
diff --git a/mm/Kconfig b/mm/Kconfig
index ccbad233f2b1e73726b6571e6affdd85c0058a01..cc43f5124cb389e1687a4c2f4a2083617c2c1926 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1038,6 +1038,16 @@ config NUMABALANCING_MEM_SAMPLING
 
 	  if unsure, say N to disable the NUMABALANCING_MEM_SAMPLING.
 
+config THP_NUMA_CONTROL
+	bool "Control THP migration during NUMA balancing"
+	depends on NUMA_BALANCING && TRANSPARENT_HUGEPAGE
+	default n
+	help
+	  Migrating THP is sometimes not beneficial; for example, with a 64K
+	  page size on ARM64 a THP is 512M and migration becomes expensive.
+	  This feature adds a switch to control the behavior of THP migration
+	  during NUMA balancing.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e41be42456673f90d17c5d78a2609913ddca14eb..2825c5390fe9159b91a9ea89d0c49ef947b925bc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -316,6 +316,36 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
 static struct kobj_attribute hpage_pmd_size_attr =
 	__ATTR_RO(hpage_pmd_size);
 
+#ifdef CONFIG_THP_NUMA_CONTROL
+unsigned long thp_numa_control __read_mostly;
+
+static ssize_t numa_control_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", READ_ONCE(thp_numa_control));
+}
+
+static ssize_t numa_control_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long value;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &value);
+	if (ret < 0)
+		return ret;
+	if (value > THP_DISABLE_NUMA_MIGRATE)
+		return -EINVAL;
+
+	WRITE_ONCE(thp_numa_control, value);
+
+	return count;
+}
+
+static struct kobj_attribute numa_control_attr =
+	__ATTR(numa_control, 0644, numa_control_show, numa_control_store);
+#endif
+
 static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
 	&defrag_attr.attr,
@@ -323,6 +353,9 @@ static struct attribute *hugepage_attr[] = {
 	&hpage_pmd_size_attr.attr,
 #ifdef CONFIG_SHMEM
 	&shmem_enabled_attr.attr,
+#endif
+#ifdef CONFIG_THP_NUMA_CONTROL
+	&numa_control_attr.attr,
 #endif
 	NULL,
 };
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
index 1d8a831be531d473317db6fc1c05b494bdd541d6..5bff1221247183dabe0ca787b9297501f9f4fc15 100644
--- a/mm/mem_sampling.c
+++ b/mm/mem_sampling.c
@@ -369,7 +369,6 @@ static void set_numabalancing_mem_sampling_state(bool enabled)
 	}
 }
 
-#ifdef CONFIG_PROC_SYSCTL
 int sysctl_numabalancing_mem_sampling(struct ctl_table *table, int write,
 			void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -391,7 +390,6 @@ int sysctl_numabalancing_mem_sampling(struct ctl_table *table, int write,
 
 	return err;
 }
-#endif
 #else
 static inline void set_numabalancing_mem_sampling_state(bool enabled)
 {
@@ -423,7 +421,6 @@ static void set_mem_sampling_state(bool enabled)
 	set_numabalancing_mem_sampling_state(enabled);
 }
 
-#ifdef CONFIG_PROC_SYSCTL
 static int sysctl_mem_sampling_enable(struct ctl_table *table, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -443,7 +440,6 @@ static int sysctl_mem_sampling_enable(struct ctl_table *table, int write,
 		set_mem_sampling_state(state);
 	return err;
 }
-#endif
 
 static struct ctl_table ctl_table[] = {
 	{
diff --git a/mm/migrate.c b/mm/migrate.c
index 857c15e43497df256aa9efc4a87b941dc4e805c0..cff5e11437d904aa87a6504bf94ce39e4e2e3912 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2161,6 +2161,9 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	 */
 	compound = PageTransHuge(page);
 
+	if (compound && thp_numa_migrate_disabled())
+		return 0;
+
 	if (compound)
 		new = alloc_misplaced_dst_page_thp;
 	else