diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 34061d75a0d21be9c981d724c4e05c90b4e1000d..745897d5f4d6e372ad718a84b6fbceade12c7c3f 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -162,6 +162,8 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y +CONFIG_SCHED_TASK_RELATIONSHIP=y +CONFIG_QOS_SCHED_NUMA_ICON=y CONFIG_QOS_SCHED_SMART_GRID=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y @@ -234,7 +236,7 @@ CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_BASE_RELATIVE=y # CONFIG_BPF_LSM is not set -# CONFIG_BPF_SCHED is not set +CONFIG_BPF_SCHED=y CONFIG_BPF_SYSCALL=y CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 1835f38f2947ccbf8c017d1d72656c378fa04c54..3c9d3d4e3964c9316db1ee4b1dd00dfcd15f28e6 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -167,6 +167,8 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y +# CONFIG_SCHED_TASK_RELATIONSHIP is not set +# CONFIG_QOS_SCHED_NUMA_ICON is not set # CONFIG_QOS_SCHED_SMART_GRID is not set CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y diff --git a/fs/exec.c b/fs/exec.c index 981b3ac90c44e66de934ceefca85c73e0e910fc0..792d62632e92aece555f19f5622a3493c712db03 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -1822,6 +1823,7 @@ static int bprm_execve(struct linux_binprm *bprm, rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); + task_relationship_free(current, true); return retval; out: diff --git a/include/linux/sched.h b/include/linux/sched.h index b4ab407cab37968510b38e8741c478b79dff4503..fa83018137ce3bf73565d30bb4b25e5335dacea7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1437,11 +1438,15 @@ struct task_struct { KABI_USE(7, void *pf_io_worker) #if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) && !defined(__GENKSYMS__) KABI_USE(8, cpumask_t *prefer_cpus) - KABI_USE(9, const cpumask_t *select_cpus) #else KABI_RESERVE(8) +#endif +#if defined(CONFIG_TASK_PLACEMENT_BY_CPU_RANGE) && !defined(__GENKSYMS__) + KABI_USE(9, const cpumask_t *select_cpus) +#else KABI_RESERVE(9) #endif + #if (defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)) && defined(CONFIG_X86) KABI_USE(10, unsigned int sequential_io) KABI_USE(11, unsigned int sequential_io_avg) @@ -1464,7 +1469,11 @@ struct task_struct { #else KABI_RESERVE(13) #endif +#if defined(CONFIG_SCHED_TASK_RELATIONSHIP) && !defined(__GENKSYMS__) + KABI_USE(14, struct task_relationship *rship) +#else KABI_RESERVE(14) +#endif KABI_RESERVE(15) KABI_RESERVE(16) KABI_AUX_PTR(task_struct) @@ -2351,6 +2360,21 @@ struct bpf_sched_cpu_stats { KABI_RESERVE(4) }; +struct bpf_node_stats { + unsigned long util; + unsigned long compute_capacity; + unsigned int weight; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) + KABI_RESERVE(5) + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) +}; + struct cpumask_op_args { unsigned int op_type; void *arg1; @@ -2374,6 +2398,28 @@ enum cpumask_op_type { CPUMASK_CPULIST_PARSE }; +enum nodemask_op_type { + NODEMASK_EMPTY, + NODEMASK_NODE_ISSET, + NODEMASK_NODES_CLEAR, + 
NODEMASK_NODE_SET, + NODEMASK_NODE_CLEAR, + NODEMASK_NODELIST_PARSE, + NODEMASK_TO_CPUMASK, + NODEMASK_NODES_ANDNOT, + NODEMASK_NODES_AND, + NODEMASK_NODES_OR, + NODEMASK_WEIGHT, + NODEMASK_ONLINE +}; + +struct nodemask_op_args { + enum nodemask_op_type op_type; + void *arg1; + void *arg2; + void *arg3; +}; + struct sched_migrate_ctx { struct task_struct *task; struct cpumask *select_idle_mask; @@ -2402,5 +2448,15 @@ struct sched_affine_ctx { KABI_RESERVE(3) KABI_RESERVE(4) }; + +struct sched_migrate_node { + int src_cpu; + int dst_cpu; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; #endif #endif diff --git a/include/linux/sched/relationship.h b/include/linux/sched/relationship.h new file mode 100644 index 0000000000000000000000000000000000000000..43aa3f9706d40521c6147947c3a774adf00186ea --- /dev/null +++ b/include/linux/sched/relationship.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_RELATIONSHIP_H +#define _LINUX_SCHED_RELATIONSHIP_H + +#include +#include +#include +#include + +#define FAULT_NODES_MAX 4 + +struct task_struct; +struct rq; + +#ifdef CONFIG_SCHED_DEBUG +struct seq_file; +#endif + +struct fault_array_info { + int nid; + unsigned long val; +}; + +struct relationship_comm { + int nr_tasks; + int gid; + nodemask_t preferred_node; +}; + +struct bpf_net_relationship { + struct relationship_comm comm; + unsigned long grp_rxtx_bytes; + unsigned long grp_remote_rxtx_bytes; +}; + +struct bpf_mm_relationship { + struct relationship_comm comm; + unsigned long grp_total_faults; + struct fault_array_info grp_faults_ordered[FAULT_NODES_MAX]; + struct fault_array_info grp_score_ordered[FAULT_NODES_MAX]; +}; + +struct bpf_relationship_get_args { + struct bpf_mm_relationship mm; + struct bpf_net_relationship net; +}; + +struct bpf_relationship_set_args { + nodemask_t preferred_node; +}; + +struct relationship_hdr { + refcount_t refcount; + spinlock_t lock; + int nr_tasks; + int gid; + nodemask_t preferred_nid; +}; + +enum net_req_type { + NET_RS_TYPE_INVALID = 0, + NET_RS_TYPE_LOCAL, + NET_RS_TYPE_RX, + NET_RS_TYPE_TX, + NET_RS_TYPE_MAX +}; + +struct net_relationship_req { + enum net_req_type net_rship_type; + pid_t rx_pid; + pid_t tx_pid; + int nic_nid; + int rx_dev_idx; + int rx_dev_queue_idx; + u64 rx_dev_netns_cookie; + unsigned long rxtx_bytes; + + /* reserved */ + unsigned long rxtx_cnt; +}; + +struct net_relationship_callback { + struct callback_head twork; + atomic_t active; + pid_t src_pid; + struct net_relationship_req req; +}; + +struct net_group { + struct rcu_head rcu; + struct relationship_hdr hdr; + unsigned long rxtx_bytes; + + /* reserved */ + unsigned long rxtx_cnt; +}; + +struct numa_fault_ext { + struct fault_array_info faults_ordered[FAULT_NODES_MAX]; +}; + +struct task_relationship { + /* network relationship */ + struct net_group __rcu *net_group; + spinlock_t net_lock; + int nic_nid; + int rx_dev_idx; + int rx_dev_queue_idx; + unsigned long rx_dev_netns_cookie; + unsigned long rxtx_remote_bytes; + unsigned long rxtx_remote_update_next; + unsigned long rxtx_remote_buffer; + unsigned long rxtx_bytes; + unsigned long rxtx_buffer; + unsigned long rxtx_update_next; + struct net_relationship_callback cb; + + /* extras numa fault data */ + struct numa_fault_ext faults; + +#ifdef CONFIG_NUMA_BALANCING + /* preferred nodes adjust */ + u64 node_stamp; + struct callback_head node_work; +#endif +}; + +#ifdef CONFIG_BPF_SCHED +struct sched_preferred_node_ctx { + struct task_struct *tsk; + nodemask_t preferred_node; 
+ + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; +#endif + +extern void task_relationship_enable(void); +extern void task_relationship_disable(void); + +#ifdef CONFIG_SCHED_DEBUG +extern void sched_show_relationship(struct task_struct *p, struct seq_file *m); +#endif + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +extern int sched_relationship_fork(struct task_struct *p); +extern void sched_relationship_free(struct task_struct *p); +void task_relationship_free(struct task_struct *tsk, bool reset); +extern bool task_relationship_supported(struct task_struct *tsk); +extern int sched_net_relationship_submit(struct net_relationship_req *req); +extern void +sctl_sched_get_net_relationship(struct task_struct *tsk, + struct sctl_net_relationship_info *info); +extern void +sctl_sched_get_mem_relationship(struct task_struct *tsk, + struct sctl_mem_relationship_info *info); +extern void sched_get_mm_relationship(struct task_struct *tsk, + struct bpf_relationship_get_args *args); +extern void sched_get_relationship(struct task_struct *tsk, + struct bpf_relationship_get_args *args); +extern void numa_faults_update_and_sort(int nid, int new, + struct fault_array_info *stats); +extern void task_tick_relationship(struct rq *rq, struct task_struct *curr); + +extern void task_preferred_node_work(struct callback_head *work); +extern void +sched_set_curr_preferred_node(struct bpf_relationship_set_args *args); + +DECLARE_STATIC_KEY_FALSE(__relationship_switch); +static inline bool task_relationship_used(void) +{ + return static_branch_unlikely(&__relationship_switch); +} +#else +static inline bool task_relationship_used(void) +{ + return false; +} + +static inline int sched_relationship_fork(struct task_struct *p) +{ + return 0; +} + +static inline void sched_relationship_free(struct task_struct *p) {} + +static inline void +task_relationship_free(struct task_struct *tsk, bool reset) {} + +static inline int +sched_net_relationship_submit(struct net_relationship_req *req) +{ + return 0; +} + +static inline void +task_tick_relationship(struct rq *rq, struct task_struct *curr) {} +#endif + +#endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 818b1244a018f55d774b8929bf4de9c7b0b80224..0a871f728c856ea88c5dabf9c3742d6db947e93e 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -10,3 +10,7 @@ BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_stru BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p, + struct sched_migrate_node *migrate_node) +BPF_SCHED_HOOK(void, (void) 0, cfs_change_preferred_node, + struct sched_preferred_node_ctx *ctx) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2b11202c3439e3e5740bcb28e057c79fe7464ac9..b87934003c407563770aa110aa1b7988f3b22cc4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3874,6 +3874,52 @@ union bpf_attr { * check src_cpu whether share cache with dst_cpu. * Return * yes 1, no 0. + * + * int bpf_nodemask_op(struct nodemask_op_args *op, int len) + * Description + * A series of nodemask-related operations. Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. 
*op*->type is one of them + * + * **NODEMASK_EMPTY** + * nodes_empty(op->arg1) returned. + * **NODEMASK_NODE_ISSET** + * node_isset(op->arg1, op->arg2) returned + * **NODEMASK_NODES_CLEAR** + * 0 returned + * **NODEMASK_NODE_CLEAR** + * unset op->arg1 from op->arg2, 0 returned + * **NODEMASK_NODE_SET** + * set op->arg1 to op->arg2, 0 returned + * **NODEMASK_WEIGHT** + * nodes_weight(op->arg1) returned + * **NODEMASK_NODELIST_PARSE** + * str *op->arg1* to nodemask_t *op->arg2*, + * 0 on success, or a negative error in case of failure. + * **NODEMASK_TO_CPUMASK** + * nodemask_t *arg1* to cpumask_t *op->arg2*, 0 returned. + * **NODEMASK_ONLINE** + * set online nodes to nodemask_t *op->arg1*, 0 returned. + * Return + * View above. + * + * int bpf_get_task_relationship_stats(struct task_struct *tsk, struct bpf_map *map, struct bpf_relationship_get_args *stats) + * Description + * get relationship statistics of *tsk* and store in *stats*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_curr_preferred_node(struct bpf_relationship_set_args *args, int len) + * Description + * set current task preferred node. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_node_stats(int nid, struct bpf_node_stats *ctx, int len) + * Description + * get resource statistics of *nid* and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4046,6 +4092,10 @@ union bpf_attr { FN(sched_entity_to_tg), \ FN(cpumask_op), \ FN(cpus_share_cache), \ + FN(nodemask_op), \ + FN(get_task_relationship_stats),\ + FN(sched_set_curr_preferred_node),\ + FN(get_node_stats), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/include/uapi/linux/sched_ctrl.h b/include/uapi/linux/sched_ctrl.h new file mode 100644 index 0000000000000000000000000000000000000000..13a4eb182d5e3a37036dce667e53242ba2f0b44d --- /dev/null +++ b/include/uapi/linux/sched_ctrl.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_SCHED_CTRL_H +#define _LINUX_SCHED_CTRL_H + +#include + + +#define SCTL_IOC_MAGIC 'X' + +/* get task relationship */ +#define SCTL_GET_RSHIP \ + _IOR(SCTL_IOC_MAGIC, 0, struct sctl_get_relationship_args) + +#define SCTL_IOC_MAXNR 1 + +#define SCTL_MAX_NUMNODES 16 +#define SCTL_STR_MAX 64 +#define NR_TASK_FAULTS_TYPE 2 + +#define NO_RSHIP (-1) + +struct grp_hdr { + int gid; + char preferred_nid[SCTL_STR_MAX]; + int nr_tasks; +}; + +struct sctl_net_relationship_info { + int valid; + struct grp_hdr grp_hdr; + int nic_nid; + int rx_dev_idx; + int rx_dev_queue_idx; + unsigned long rx_dev_netns_cookie; + unsigned long rxtx_remote_bytes; + unsigned long rxtx_bytes; + unsigned long grp_rxtx_bytes; +}; + +struct sctl_mem_relationship_info { + int valid; + struct grp_hdr grp_hdr; + int nodes_num; + unsigned long total_faults; + unsigned long grp_total_faults; + unsigned long faults[SCTL_MAX_NUMNODES][NR_TASK_FAULTS_TYPE]; + unsigned long faults_cpu[SCTL_MAX_NUMNODES][NR_TASK_FAULTS_TYPE]; + unsigned long grp_faults[SCTL_MAX_NUMNODES][NR_TASK_FAULTS_TYPE]; + unsigned long grp_faults_cpu[SCTL_MAX_NUMNODES][NR_TASK_FAULTS_TYPE]; +}; + +struct sctl_get_relationship_args { + int tid; + struct sctl_net_relationship_info nrsi; + struct sctl_mem_relationship_info mrsi; +}; +#endif /* _LINUX_SCHED_CTRL_H */ diff --git a/init/Kconfig b/init/Kconfig index 
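For reference, the new SCTL_GET_RSHIP interface can be driven from userspace roughly as sketched below (illustrative only, not part of the patch; it assumes the relationship_ctrl misc device registered later in this series shows up as /dev/relationship_ctrl, and the ioctl handler requires CAP_SYS_ADMIN):

	/* Minimal userspace sketch for the SCTL_GET_RSHIP ioctl (illustrative). */
	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/sched_ctrl.h>

	int main(int argc, char **argv)
	{
		struct sctl_get_relationship_args args = { 0 };
		int fd;

		if (argc < 2)
			return 1;
		args.tid = atoi(argv[1]);

		/* Device name comes from the relationship_ctrl miscdevice below. */
		fd = open("/dev/relationship_ctrl", O_RDWR);
		if (fd < 0 || ioctl(fd, SCTL_GET_RSHIP, &args) < 0) {
			perror("SCTL_GET_RSHIP");
			return 1;
		}

		if (args.nrsi.valid)
			printf("net group %d: rxtx_bytes %lu remote %lu\n",
			       args.nrsi.grp_hdr.gid, args.nrsi.rxtx_bytes,
			       args.nrsi.rxtx_remote_bytes);
		if (args.mrsi.valid)
			printf("numa group %d: task faults %lu group faults %lu\n",
			       args.mrsi.grp_hdr.gid, args.mrsi.total_faults,
			       args.mrsi.grp_total_faults);

		close(fd);
		return 0;
	}
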
69bd400daeb3b03e0c776302a6964e0ff0fe41a5..b722b7a887c1f63af8af7fcd37720b59df5d91e4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1066,9 +1066,13 @@ config RT_GROUP_SCHED endif #CGROUP_SCHED +config TASK_PLACEMENT_BY_CPU_RANGE + bool "variable cpu range for task placement" + config QOS_SCHED_DYNAMIC_AFFINITY bool "qos dynamic affinity" depends on CPUSETS + select TASK_PLACEMENT_BY_CPU_RANGE default n help This feature lets you allocate preferred cpus to taskgroup. If enabled, @@ -1076,6 +1080,29 @@ config QOS_SCHED_DYNAMIC_AFFINITY of taskgroup is below threshold setted, otherwise make taskgroup to use cpus allowed. +config SCHED_TASK_RELATIONSHIP + bool "task relationship" + depends on NUMA_BALANCING + default n + help + This feature enables the scheduler to identify tasks relationship by + page fault, SPE, socket and other IPC method. + + If in doubt, say N. + +config QOS_SCHED_NUMA_ICON + bool "numa aware schedule" + depends on BPF_SCHED + depends on SCHED_TASK_RELATIONSHIP + default n + help + This feature provides the NUMA Isolation and Consolidationthe + Mechanisms based on ebpf and task relationship. If enabled, scheduler + places related tasks on same numa node when the node has spare + resource. + + If in doubt, say N. + config UCLAMP_TASK_GROUP bool "Utilization clamping per group of tasks" depends on CGROUP_SCHED @@ -1838,6 +1865,7 @@ config BPF_SCHED bool "SCHED Instrumentation with BPF" depends on BPF_EVENTS depends on BPF_SYSCALL + select TASK_PLACEMENT_BY_CPU_RANGE help Enables instrumentation of the sched hooks with eBPF programs for implementing dynamic scheduling policies. diff --git a/init/init_task.c b/init/init_task.c index fa8838c2c203b694b0ef6e9c4c2e83050d9e3e39..3b846f8223d96663354155c05eb6052257a580c0 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -217,6 +217,9 @@ struct task_struct init_task #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY .prefer_cpus = NULL, #endif +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + .rship = NULL, +#endif #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif diff --git a/kernel/fork.c b/kernel/fork.c index 079b718131b064eb2cc875d8ce1d45f5ce2f7969..12db99751381f2a155e4902bfede63231b182c41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -476,6 +476,8 @@ void free_task(struct task_struct *tsk) #ifdef CONFIG_QOS_SCHED_SMART_GRID sched_grid_qos_free(tsk); #endif + if (task_relationship_used()) + sched_relationship_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -748,6 +750,7 @@ void __put_task_struct(struct task_struct *tsk) io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); + task_relationship_free(tsk, false); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); @@ -949,6 +952,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->prefer_cpus = NULL; #endif +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + tsk->rship = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -2102,6 +2109,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; #endif + if (task_relationship_used()) { + retval = sched_relationship_fork(p); + if (retval) + goto bad_fork_cleanup_count; + } + /* * If multiple threads are within copy_process(), then this check * triggers too late. 
This doesn't hurt, the check is only there diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a6fe0ee09917a9a81a9ecaf2e73f16bd616400fd..ff9ff2c17f79d881430f47b63489b3f6a73f6f5c 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -40,3 +40,5 @@ obj-$(CONFIG_SCHED_CORE) += core_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_topology.o obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ +obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o relationship_ioctl.o +obj-$(CONFIG_QOS_SCHED_NUMA_ICON) += numa_icon.o diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 220ba83fc5f4a0fae183db3d32b320ad9f30ab66..3e14d1fa911ee2abcee7242e08a4a7de7970e3b8 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -260,6 +260,75 @@ static const struct bpf_func_proto bpf_cpumask_op_proto = { .arg2_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_nodemask_op, struct nodemask_op_args *, op, int, len) +{ + struct cpumask *cpumask; + nodemask_t mask; + int nid; + + if (len != sizeof(*op) || !op->arg1) + return -EINVAL; + + switch (op->op_type) { + case NODEMASK_EMPTY: + mask = *(nodemask_t *)op->arg1; + return nodes_empty(mask); + case NODEMASK_NODE_ISSET: + mask = *(nodemask_t *)op->arg2; + return node_isset(*(int *)op->arg1, mask); + case NODEMASK_NODES_CLEAR: + __nodes_clear((nodemask_t *)op->arg1, MAX_NUMNODES); + break; + case NODEMASK_NODE_CLEAR: + __node_clear(*(int *)op->arg1, (nodemask_t *)op->arg2); + break; + case NODEMASK_NODE_SET: + __node_set(*(int *)op->arg1, (nodemask_t *)op->arg2); + break; + case NODEMASK_NODES_AND: + __nodes_and((nodemask_t *)op->arg1, (nodemask_t *)op->arg2, + (nodemask_t *)op->arg3, MAX_NUMNODES); + break; + case NODEMASK_NODES_ANDNOT: + __nodes_andnot((nodemask_t *)op->arg1, (nodemask_t *)op->arg2, + (nodemask_t *)op->arg3, MAX_NUMNODES); + break; + case NODEMASK_NODES_OR: + __nodes_or((nodemask_t *)op->arg1, (nodemask_t *)op->arg2, + (nodemask_t *)op->arg3, MAX_NUMNODES); + break; + case NODEMASK_WEIGHT: + mask = *(nodemask_t *)op->arg1; + return nodes_weight(mask); + case NODEMASK_NODELIST_PARSE: + return __nodelist_parse((const char *)op->arg1, + (nodemask_t *)op->arg2, MAX_NUMNODES); + case NODEMASK_TO_CPUMASK: + mask = *(nodemask_t *)op->arg1; + cpumask = (struct cpumask *)op->arg2; + cpumask_clear(cpumask); + for_each_node_mask(nid, mask) { + cpumask_or(cpumask, cpumask, cpumask_of_node(nid)); + } + break; + case NODEMASK_ONLINE: + *(nodemask_t *)op->arg1 = node_online_map; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_nodemask_op_proto = { + .func = bpf_nodemask_op, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, +}; + BPF_CALL_2(bpf_cpus_share_cache, int, src_cpu, int, dst_cpu) { if ((unsigned int)src_cpu >= nr_cpu_ids || @@ -277,6 +346,74 @@ static const struct bpf_func_proto bpf_cpus_share_cache_proto = { .arg2_type = ARG_ANYTHING, }; +#ifdef CONFIG_QOS_SCHED_NUMA_ICON +BPF_CALL_3(bpf_get_node_stats, int, nid, + struct bpf_node_stats *, ctx, + int, len) +{ + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)nid >= nr_node_ids) + return -EINVAL; + + sched_get_node_load(nid, ctx); + return 0; +} + +const struct bpf_func_proto bpf_get_node_stats_proto = { + .func = bpf_get_node_stats, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; +#endif + +#ifdef 
CONFIG_SCHED_TASK_RELATIONSHIP +BPF_CALL_3(bpf_get_task_relationship_stats, struct task_struct *, tsk, + struct bpf_map *, map, struct bpf_relationship_get_args *, args) +{ + if (!task_relationship_supported(tsk)) + return -EPERM; + + if (!args) + return -EINVAL; + + sched_get_relationship(tsk, args); + return 0; +} + +const struct bpf_func_proto bpf_get_task_relationship_stats_proto = { + .func = bpf_get_task_relationship_stats, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, +}; + +BPF_CALL_2(bpf_sched_set_curr_preferred_node, + struct bpf_relationship_set_args *, args, int, len) +{ + if (!args || len != sizeof(*args)) + return -EINVAL; + + sched_set_curr_preferred_node(args); + return 0; +} + +const struct bpf_func_proto bpf_sched_set_curr_preferred_node_proto = { + .func = bpf_sched_set_curr_preferred_node, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, +}; +#endif + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -299,6 +436,18 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_cpumask_op_proto; case BPF_FUNC_cpus_share_cache: return &bpf_cpus_share_cache_proto; + case BPF_FUNC_nodemask_op: + return &bpf_nodemask_op_proto; +#ifdef CONFIG_QOS_SCHED_NUMA_ICON + case BPF_FUNC_get_node_stats: + return &bpf_get_node_stats_proto; +#endif +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + case BPF_FUNC_get_task_relationship_stats: + return &bpf_get_task_relationship_stats_proto; + case BPF_FUNC_sched_set_curr_preferred_node: + return &bpf_sched_set_curr_preferred_node_proto; +#endif default: return bpf_base_func_proto(func_id); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa71c7c5119641fbd162e0564e85059deb4e8027..d034294c59ceb4a29be7df9caac49c42e6f198fc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8225,6 +8225,9 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#ifdef CONFIG_BPF_SCHED +DECLARE_PER_CPU(cpumask_var_t, select_cpu_mask); +#endif void __init sched_init(void) { @@ -8278,6 +8281,10 @@ void __init sched_init(void) cpumask_size(), GFP_KERNEL, cpu_to_node(i)); per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); +#ifdef CONFIG_BPF_SCHED + per_cpu(select_cpu_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); +#endif } #endif /* CONFIG_CPUMASK_OFFSTACK */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 00f01518bbddc5935dd6029d686266be439ae72f..5233ba9fdc697d776246bee1b96e81d207e045fd 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1040,6 +1040,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, } sched_show_numa(p, m); + + sched_show_relationship(p, m); } void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f39e7547523c78bda1af34d28df053b1670296d3..404358af80c74637932f0c6a84f15302babee513 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1086,6 +1086,13 @@ struct numa_group { struct rcu_head rcu; unsigned long total_faults; unsigned long max_faults_cpu; +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct 
fault_array_info score_ordered[FAULT_NODES_MAX]; + struct fault_array_info faults_ordered[FAULT_NODES_MAX]; + nodemask_t preferred_nid; + u64 node_stamp; + u64 nodes_switch_cnt; +#endif /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -2279,6 +2286,9 @@ static int preferred_group_nid(struct task_struct *p, int nid) { nodemask_t nodes; int dist; +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct numa_group *ng; +#endif /* Direct connections between all NUMA nodes. */ if (sched_numa_topology_type == NUMA_DIRECT) @@ -2301,7 +2311,19 @@ static int preferred_group_nid(struct task_struct *p, int nid) max_score = score; max_node = node; } +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + ng = deref_curr_numa_group(p); + if (ng) { + spin_lock_irq(&ng->lock); + numa_faults_update_and_sort(node, score, + ng->score_ordered); + spin_unlock_irq(&ng->lock); + } + } +#endif } + return max_node; } @@ -2451,6 +2473,17 @@ static void task_numa_placement(struct task_struct *p) max_faults = group_faults; max_nid = nid; } + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + numa_faults_update_and_sort(nid, faults, + p->rship->faults.faults_ordered); + + if (ng) + numa_faults_update_and_sort(nid, group_faults, + ng->faults_ordered); + } +#endif } if (ng) { @@ -2512,6 +2545,17 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + grp->preferred_nid = NODE_MASK_NONE; + grp->node_stamp = jiffies; + for (i = 0; i < FAULT_NODES_MAX; i++) { + grp->faults_ordered[i].nid = -1; + grp->score_ordered[i].nid = -1; + } + } +#endif } rcu_read_lock(); @@ -2623,6 +2667,15 @@ void task_numa_free(struct task_struct *p, bool final) p->total_numa_faults = 0; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) numa_faults[i] = 0; + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + for (i = 0; i < FAULT_NODES_MAX; i++) { + p->rship->faults.faults_ordered[i].nid = -1; + p->rship->faults.faults_ordered[i].val = 0; + } + } +#endif } } @@ -2992,6 +3045,91 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu) #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +void sctl_sched_get_mem_relationship(struct task_struct *tsk, + struct sctl_mem_relationship_info *info) +{ +#ifdef CONFIG_NUMA_BALANCING + struct task_relationship *rship = tsk->rship; + int nid, priv, cpu_idx, mem_idx; + struct numa_group *grp; + + info->valid = false; + if (unlikely(!rship) || !tsk->numa_faults) + return; + + memset(info, 0, sizeof(*info)); + info->valid = true; + info->nodes_num = nr_node_ids; + info->grp_hdr.gid = NO_RSHIP; + info->total_faults = tsk->total_numa_faults; + + rcu_read_lock(); + + grp = rcu_dereference(tsk->numa_group); + if (grp) { + info->grp_hdr.gid = grp->gid; + info->grp_hdr.nr_tasks = grp->nr_tasks; + snprintf(info->grp_hdr.preferred_nid, SCTL_STR_MAX, "%*pbl", + nodemask_pr_args(&grp->preferred_nid)); + } + + for_each_online_node(nid) { + if (nid >= SCTL_MAX_NUMNODES) + break; + + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + info->faults[nid][priv] = tsk->numa_faults[mem_idx]; + info->faults_cpu[nid][priv] = tsk->numa_faults[cpu_idx]; + + if (grp) { + 
info->grp_faults[nid][priv] = grp->faults[mem_idx]; + info->grp_faults_cpu[nid][priv] = grp->faults_cpu[mem_idx]; + info->grp_total_faults = grp->total_faults; + } + } + } + + rcu_read_unlock(); +#endif +} + +#ifdef CONFIG_BPF_SCHED +void sched_get_mm_relationship(struct task_struct *tsk, + struct bpf_relationship_get_args *args) +{ +#ifdef CONFIG_NUMA_BALANCING + struct numa_group *grp; + + grp = rcu_dereference(tsk->numa_group); + if (grp) { + args->mm.comm.gid = grp->gid; + args->mm.comm.nr_tasks = grp->nr_tasks; + args->mm.grp_total_faults = grp->total_faults; + args->mm.comm.preferred_node = grp->preferred_nid; + memcpy(args->mm.grp_faults_ordered, grp->faults_ordered, + sizeof(args->mm.grp_faults_ordered)); + memcpy(args->mm.grp_score_ordered, grp->score_ordered, + sizeof(args->mm.grp_score_ordered)); + } +#endif +} + +void sched_set_curr_preferred_node(struct bpf_relationship_set_args *args) +{ +#ifdef CONFIG_NUMA_BALANCING + struct numa_group *grp = rcu_dereference_raw(current->numa_group); + + grp->preferred_nid = args->preferred_node; + schedstat_inc(grp->nodes_switch_cnt); +#endif +} +#endif + +#endif + #ifdef CONFIG_QOS_SCHED_PRIO_LB static __always_inline void adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *), @@ -3816,6 +3954,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq_util_change(cfs_rq, 0); + numa_load_change(cfs_rq); + trace_pelt_cfs_tp(cfs_rq); } @@ -3846,6 +3986,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq_util_change(cfs_rq, 0); + numa_load_change(cfs_rq); + trace_pelt_cfs_tp(cfs_rq); } @@ -3886,6 +4028,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); + numa_load_change(cfs_rq); if (flags & UPDATE_TG) update_tg_load_avg(cfs_rq); @@ -6578,6 +6721,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Working cpumask for: load_balance, load_balance_newidle. 
*/ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +#ifdef CONFIG_BPF_SCHED +DEFINE_PER_CPU(cpumask_var_t, select_cpu_mask); +#endif #ifdef CONFIG_NO_HZ_COMMON @@ -6838,7 +6984,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE for_each_cpu_and(i, sched_group_span(group), p->select_cpus) { #else for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { @@ -6889,7 +7035,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus)) #else if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) @@ -7020,7 +7166,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->select_cpus)) { #else if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { @@ -7080,7 +7226,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1; -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); #else cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); @@ -7248,7 +7394,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled(); if ((available_idle_cpu(target) || sched_idle_cpu(target)) && -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE cpumask_test_cpu(target, p->select_cpus) && #endif asym_fits_capacity(task_util, target)) { @@ -7261,7 +7407,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE cpumask_test_cpu(prev, p->select_cpus) && #endif asym_fits_capacity(task_util, prev)) { @@ -7297,7 +7443,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) && #else cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && @@ -7897,7 +8043,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); #ifdef CONFIG_BPF_SCHED struct sched_migrate_ctx ctx; - cpumask_t *cpus_prev = NULL; cpumask_t *cpus; int ret; #endif @@ -7912,8 +8057,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ lockdep_assert_held(&p->pi_lock); -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE p->select_cpus = p->cpus_ptr; +#endif + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY if (dynamic_affinity_used() || smart_grid_used()) set_task_select_cpus(p, 
&idlest_cpu, sd_flag); #endif @@ -7928,7 +8076,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = prev_cpu; } -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->select_cpus); #else want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); @@ -7945,18 +8093,18 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f ctx.wake_flags = wake_flags; ctx.want_affine = want_affine; ctx.sd_flag = sd_flag; - ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask); + ctx.select_idle_mask = + this_cpu_cpumask_var_ptr(select_cpu_mask); ret = bpf_sched_cfs_select_rq(&ctx); if (ret >= 0) { rcu_read_unlock(); return ret; } else if (ret != -1) { - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - if (cpumask_subset(cpus, p->cpus_ptr) && + cpus = this_cpu_cpumask_var_ptr(select_cpu_mask); + if (cpumask_subset(cpus, p->select_cpus) && !cpumask_empty(cpus)) { - cpus_prev = (void *)p->cpus_ptr; - p->cpus_ptr = cpus; + p->select_cpus = cpus; } } } @@ -7969,7 +8117,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE new_cpu = cpu; if (cpu != prev_cpu && cpumask_test_cpu(prev_cpu, p->select_cpus)) @@ -8004,11 +8152,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (bpf_sched_enabled()) { ctx.new_cpu = new_cpu; ret = bpf_sched_cfs_select_rq_exit(&ctx); - if (ret >= 0) - new_cpu = ret; - - if (cpus_prev) - p->cpus_ptr = cpus_prev; + if (ret > 0 && ret <= nr_cpu_ids) + new_cpu = ret - 1; } #endif @@ -9486,9 +9631,23 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; +#ifdef CONFIG_BPF_SCHED + struct sched_migrate_node migrate_node; + int ret; +#endif lockdep_assert_rq_held(env->src_rq); +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + migrate_node.src_cpu = env->src_cpu; + migrate_node.dst_cpu = env->dst_cpu; + ret = bpf_sched_cfs_can_migrate_task(p, &migrate_node); + if (ret > 0) + return ret - 1; + } +#endif + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or @@ -10845,7 +11004,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; /* Skip over this group if it has no CPUs allowed */ -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE if (!cpumask_intersects(sched_group_span(group), p->select_cpus)) #else @@ -13130,6 +13289,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) update_overutilized_status(task_rq(curr)); task_tick_core(rq, curr); + + task_tick_relationship(rq, curr); + + update_numa_capacity(rq); } /* @@ -13691,7 +13854,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) struct numa_group *ng; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; @@ -13706,8 +13869,99 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ + +void sched_show_relationship(struct task_struct *p, struct seq_file *m) +{ +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct net_group *net_grp; + struct numa_group 
*ng; + int node; + + if (!task_relationship_used()) + return; + + rcu_read_lock(); + + ng = rcu_dereference(p->numa_group); + if (ng) { + seq_printf(m, "numa group preferred nid %*pbl switch_cnt %llu\n", + nodemask_pr_args(&ng->preferred_nid), + ng->nodes_switch_cnt); + } + + net_grp = rcu_dereference(p->rship->net_group); + if (net_grp) { + seq_printf(m, "net group gid %d preferred nid %*pbl\n", + net_grp->hdr.gid, + nodemask_pr_args(&net_grp->hdr.preferred_nid)); + } + + rcu_read_unlock(); + + for_each_online_node(node) { + print_node_load_info(m, node); + } +#endif +} #endif /* CONFIG_SCHED_DEBUG */ +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +void task_preferred_node_work(struct callback_head *work) +{ +#ifdef CONFIG_NUMA_BALANCING + struct task_struct *curr = current; + struct numa_group *numa_grp; +#ifdef CONFIG_BPF_SCHED + struct sched_preferred_node_ctx ctx = {0}; +#endif + + work->next = work; + +#ifdef CONFIG_BPF_SCHED + numa_grp = deref_curr_numa_group(curr); + if (numa_grp) { + + spin_lock_irq(&numa_grp->lock); + ctx.tsk = curr; + ctx.preferred_node = numa_grp->preferred_nid; + bpf_sched_cfs_change_preferred_node(&ctx); + spin_unlock_irq(&numa_grp->lock); + } +#endif +#endif +} + +void task_tick_relationship(struct rq *rq, struct task_struct *curr) +{ +#ifdef CONFIG_NUMA_BALANCING + struct callback_head *work = &curr->rship->node_work; + struct numa_group *numa_grp; + + if (!task_relationship_supported(curr)) + return; + + if (work->next != work) + return; + + numa_grp = deref_curr_numa_group(curr); + if (!numa_grp || numa_grp->nr_tasks <= 1) + return; + + spin_lock(&numa_grp->lock); + + if (time_after(jiffies, + (unsigned long)(numa_grp->node_stamp + msecs_to_jiffies(100)))) { + numa_grp->node_stamp = jiffies; + spin_unlock(&numa_grp->lock); + task_work_add(curr, &curr->rship->node_work, TWA_RESUME); + return; + } + + spin_unlock(&numa_grp->lock); +#endif +} +#endif + __init void init_sched_fair_class(void) { #ifdef CONFIG_QOS_SCHED @@ -13717,6 +13971,8 @@ __init void init_sched_fair_class(void) INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); #endif + init_sched_numa_icon(); + #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/sched/numa_icon.c b/kernel/sched/numa_icon.c new file mode 100644 index 0000000000000000000000000000000000000000..e9825ac7f866e15414352ec68c69567fa217e2ba --- /dev/null +++ b/kernel/sched/numa_icon.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for task numa isolation consolidation + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Hui Tang + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ +#include "sched.h" + +static bool __sched_numa_icon_switch __initdata; +DEFINE_STATIC_KEY_FALSE(sched_numa_icon_switch); + +struct node_load_info *node_load_ptr; + +static void set_numa_icon_switch(bool enabled) +{ + if (enabled) { + static_branch_enable(&sched_numa_icon_switch); + task_relationship_enable(); + } else { + static_branch_disable(&sched_numa_icon_switch); + task_relationship_disable(); + } +} + +static int __init numa_icon_switch_setup(char *str) +{ + int ret = 0; + + if (!str) + goto out; + + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_numa_icon_switch() can do it later. + */ + if (!strcmp(str, "enable")) { + __sched_numa_icon_switch = true; + ret = 1; + } else if (!strcmp(str, "disable")) { + __sched_numa_icon_switch = false; + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse numa_icon=\n"); + + return ret; +} +__setup("numa_icon=", numa_icon_switch_setup); + +__init void init_sched_numa_icon(void) +{ + int i; + + set_numa_icon_switch(__sched_numa_icon_switch); + + if (!sched_numa_icon_enabled()) + return; + + node_load_ptr = kcalloc(nr_node_ids, sizeof(struct node_load_info), + GFP_KERNEL); + + for (i = 0; i < nr_node_ids; i++) { + raw_spin_lock_init(&node_load_ptr[i].lock); + node_load_ptr[i].util_avg_last = + kcalloc(nr_cpu_ids, sizeof(struct sched_avg), GFP_KERNEL); + } + + for_each_possible_cpu(i) { + node_load_ptr[cpu_to_node(i)].compute_capacity += + SCHED_CAPACITY_SCALE; + } +} + +void print_node_load_info(struct seq_file *m, int node) +{ + if (!sched_numa_icon_enabled()) + return; + + seq_printf(m, "node %d capacity=%lu util_avg=%lu\n", node, + node_load_ptr[node].compute_capacity, + atomic_long_read(&node_load_ptr[node].util_avg)); +} + +void numa_load_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + int nid = cpu_to_node(cpu); + struct sched_avg *avg_old; + long delta; + + if (!sched_numa_icon_enabled()) + return; + + avg_old = &node_load_ptr[nid].util_avg_last[cpu]; + + if (&rq->cfs != cfs_rq) + return; + + delta = cfs_rq->avg.util_avg - avg_old->util_avg; + atomic_long_add(delta, &node_load_ptr[nid].util_avg); + avg_old->util_avg = cfs_rq->avg.util_avg; +} + +void update_numa_capacity(struct rq *rq) +{ + int cpu = cpu_of(rq); + int nid = cpu_to_node(cpu); + unsigned long capacity = 0; + + if (!sched_numa_icon_enabled()) + return; + + if (cpu != cpumask_first(cpumask_of_node(nid))) + return; + + for_each_cpu(cpu, cpumask_of_node(nid)) { + capacity += cpu_rq(cpu)->cpu_capacity; + } + node_load_ptr[nid].compute_capacity = capacity; +} + +#ifdef CONFIG_BPF_SCHED +void sched_get_node_load(int nid, struct bpf_node_stats *ctx) +{ + ctx->util = atomic_long_read(&node_load_ptr[nid].util_avg); + ctx->compute_capacity = node_load_ptr[nid].compute_capacity; + ctx->weight = cpumask_weight(cpumask_of_node(nid)); +} +#endif diff --git a/kernel/sched/numa_icon.h b/kernel/sched/numa_icon.h new file mode 100644 index 0000000000000000000000000000000000000000..adeed53e9f14502e33e83d62852bda9c952ff343 --- /dev/null +++ b/kernel/sched/numa_icon.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_NUMA_ICON_H +#include + +struct node_load_info { + raw_spinlock_t lock ____cacheline_aligned; + atomic_long_t util_avg; + unsigned long compute_capacity; + struct sched_avg *util_avg_last; +}; + +#ifdef CONFIG_QOS_SCHED_NUMA_ICON +extern struct static_key_false 
sched_numa_icon_switch; +static __always_inline bool sched_numa_icon_enabled(void) +{ + return static_branch_unlikely(&sched_numa_icon_switch); +} + +extern void print_node_load_info(struct seq_file *m, int node); +extern __init void init_sched_numa_icon(void); +extern void sched_get_node_load(int nid, struct bpf_node_stats *ctx); +extern void init_node_load(struct rq *rq); +extern void numa_load_change(struct cfs_rq *cfs_rq); +extern void update_numa_capacity(struct rq *rq); + +#else /* !CONFIG_QOS_SCHED_NUMA_ICON */ +static inline void init_sched_numa_icon(void) {} + +static inline void init_node_load(struct rq *rq) {} + +static inline void numa_load_change(struct cfs_rq *cfs_rq) {} + +static inline void update_numa_capacity(struct rq *rq) {} + +static inline void print_node_load_info(struct seq_file *m, int node) {} + +static __always_inline bool sched_numa_icon_enabled(void) +{ + return false; +} +#endif /* CONFIG_QOS_SCHED_NUMA_ICON */ + +#endif diff --git a/kernel/sched/relationship.c b/kernel/sched/relationship.c new file mode 100644 index 0000000000000000000000000000000000000000..515c913aeb334d66464a7d0147d1d094e4314d77 --- /dev/null +++ b/kernel/sched/relationship.c @@ -0,0 +1,436 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for task relationship aware + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Hui Tang + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ +#include +#include + +#include "sched.h" + +#define RXTX_BYTES_PERIOD_MS (1000) +#define RXTX_BYTES_DECAY_RATIO (2) + +DEFINE_STATIC_KEY_FALSE(__relationship_switch); + +void task_relationship_enable(void) +{ + static_branch_enable(&__relationship_switch); +} + +void task_relationship_disable(void) +{ + static_branch_disable(&__relationship_switch); +} + +bool task_relationship_supported(struct task_struct *tsk) +{ + if (!task_relationship_used()) + return false; + + if (!tsk->rship || !tsk->mm || + !cpumask_subset(cpu_online_mask, tsk->cpus_ptr) || + !nodes_subset(node_online_map, tsk->mems_allowed) || + get_task_policy(tsk)->mode == MPOL_BIND || + get_task_policy(tsk)->mode == MPOL_INTERLEAVE) + return false; + + return true; +} + +static inline int get_net_group(struct net_group *grp) +{ + return refcount_inc_not_zero(&grp->hdr.refcount); +} + +static inline void put_net_group(struct net_group *grp) +{ + if (refcount_dec_and_test(&grp->hdr.refcount)) + kfree_rcu(grp, rcu); +} + +static inline void put_task_net_group(struct task_struct *tsk, bool reset) +{ + struct net_group *grp; + unsigned long flags; + + spin_lock_irqsave(&tsk->rship->net_lock, flags); + + grp = rcu_dereference_protected(tsk->rship->net_group, + lockdep_is_held(&tsk->rship->net_lock)); + if (grp) { + spin_lock(&grp->hdr.lock); + grp->rxtx_bytes -= tsk->rship->rxtx_bytes; + grp->hdr.nr_tasks--; + spin_unlock(&grp->hdr.lock); + put_net_group(grp); + RCU_INIT_POINTER(tsk->rship->net_group, NULL); + } + + if (reset) { + tsk->rship->rxtx_bytes = 0; + tsk->rship->rxtx_remote_bytes = 0; + tsk->rship->rx_dev_idx = -1; + tsk->rship->rx_dev_queue_idx = -1; + tsk->rship->nic_nid = -1; + tsk->rship->rx_dev_netns_cookie = 0; + } + + spin_unlock_irqrestore(&tsk->rship->net_lock, flags); +} + +static inline int remote_rxtx_process(struct net_relationship_req *req) +{ + struct task_relationship *rship; + struct task_struct *tsk; + unsigned long flags; + pid_t pid; + long diff; + + rcu_read_lock(); + + pid = req->net_rship_type == NET_RS_TYPE_RX ? 
req->rx_pid : req->tx_pid; + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (!tsk || !task_relationship_supported(tsk)) + goto out_unlock; + + rship = tsk->rship; + if (time_after(jiffies, rship->rxtx_remote_update_next)) { + diff = rship->rxtx_remote_buffer - rship->rxtx_remote_bytes / 2; + + spin_lock_irqsave(&rship->net_lock, flags); + rship->nic_nid = req->nic_nid; + if (req->net_rship_type == NET_RS_TYPE_RX) { + rship->rx_dev_idx = req->rx_dev_idx; + rship->rx_dev_queue_idx = req->rx_dev_queue_idx; + rship->rx_dev_netns_cookie = req->rx_dev_netns_cookie; + } + rship->rxtx_remote_bytes += diff; + rship->rxtx_remote_buffer = 0; + spin_unlock_irqrestore(&rship->net_lock, flags); + } + + rship->rxtx_remote_buffer += req->rxtx_bytes; + +out_unlock: + rcu_read_unlock(); + + return 0; +} + +int sched_net_relationship_submit(struct net_relationship_req *req) +{ + struct task_struct *rx_tsk, *tx_tsk, *dst_tsk; + struct net_group *rx_grp, *tx_grp; + int ret; + + if (req->net_rship_type == NET_RS_TYPE_RX || + req->net_rship_type == NET_RS_TYPE_TX) + return remote_rxtx_process(req); + + rcu_read_lock(); + + rx_tsk = find_task_by_pid_ns(req->rx_pid, &init_pid_ns); + tx_tsk = find_task_by_pid_ns(req->tx_pid, &init_pid_ns); + if (!rx_tsk || !tx_tsk) { + ret = -ESRCH; + goto out_unlock; + } + + if (!task_relationship_supported(rx_tsk) || + !task_relationship_supported(tx_tsk)) { + ret = -EPERM; + goto out_unlock; + } + + if (atomic_read(&rx_tsk->rship->cb.active) && + atomic_read(&tx_tsk->rship->cb.active)) { + ret = -EBUSY; + goto out_unlock; + } + + rx_grp = rcu_dereference(rx_tsk->rship->net_group); + tx_grp = rcu_dereference(tx_tsk->rship->net_group); + if (rx_grp && tx_grp) { + dst_tsk = rx_grp->hdr.nr_tasks >= tx_grp->hdr.nr_tasks ? + rx_tsk : tx_tsk; + } else if (rx_grp) { + dst_tsk = rx_tsk; + } else if (tx_grp) { + dst_tsk = tx_tsk; + } else { + dst_tsk = !atomic_read(&rx_tsk->rship->cb.active) ? + rx_tsk : tx_tsk; + } + + if (atomic_cmpxchg(&dst_tsk->rship->cb.active, 0, 1)) { + ret = -EBUSY; + goto out_unlock; + } + + memcpy(&dst_tsk->rship->cb.req, req, sizeof(*req)); + dst_tsk->rship->cb.src_pid = dst_tsk == rx_tsk ? 
+ req->tx_pid : req->rx_pid; + task_work_add(dst_tsk, &dst_tsk->rship->cb.twork, TWA_RESUME); + ret = 0; + +out_unlock: + rcu_read_unlock(); + return ret; +} + +static void task_net_group(struct task_struct *curr, struct task_struct *src) +{ + struct net_group *src_grp, *curr_grp, *grp; + + double_lock_irq(&src->rship->net_lock, &curr->rship->net_lock); + curr_grp = rcu_dereference_protected(curr->rship->net_group, + lockdep_is_held(&curr->rship->net_lock)); + src_grp = rcu_dereference_protected(src->rship->net_group, + lockdep_is_held(&src->rship->net_lock)); + + if (!curr_grp) { + grp = kzalloc(sizeof(*grp), GFP_ATOMIC | __GFP_NOWARN); + if (!grp) + goto out_unlock; + + refcount_set(&grp->hdr.refcount, 1); + spin_lock_init(&grp->hdr.lock); + grp->hdr.gid = curr->pid; + grp->hdr.preferred_nid = NODE_MASK_NONE; + node_set(task_node(curr), grp->hdr.preferred_nid); + grp->hdr.nr_tasks = 1; + rcu_assign_pointer(curr->rship->net_group, grp); + curr_grp = rcu_dereference_protected(curr->rship->net_group, + lockdep_is_held(&curr->rship->net_lock)); + } + + if (curr_grp == src_grp) + goto out_unlock; + + if (!get_net_group(curr_grp)) + goto out_unlock; + + spin_lock(&curr_grp->hdr.lock); + curr_grp->hdr.nr_tasks++; + curr_grp->rxtx_bytes += src->rship->rxtx_bytes; + spin_unlock(&curr_grp->hdr.lock); + + if (src_grp) { + spin_lock(&src_grp->hdr.lock); + src_grp->hdr.nr_tasks--; + src_grp->rxtx_bytes -= src->rship->rxtx_bytes; + spin_unlock(&src_grp->hdr.lock); + put_net_group(src_grp); + } + + rcu_assign_pointer(src->rship->net_group, curr_grp); +out_unlock: + spin_unlock(&src->rship->net_lock); + spin_unlock_irq(&curr->rship->net_lock); +} + +static void task_rxtx_data_update(struct task_struct *tsk) +{ + struct net_group *grp; + long bytes_diff; + + spin_lock_irq(&tsk->rship->net_lock); + bytes_diff = tsk->rship->rxtx_buffer - + tsk->rship->rxtx_bytes / RXTX_BYTES_DECAY_RATIO; + tsk->rship->rxtx_bytes += bytes_diff; + tsk->rship->rxtx_buffer = 0; + tsk->rship->rxtx_update_next = jiffies + + msecs_to_jiffies(RXTX_BYTES_PERIOD_MS); + + grp = rcu_dereference_protected(tsk->rship->net_group, + lockdep_is_held(&tsk->rship->net_lock)); + if (grp) { + spin_lock(&grp->hdr.lock); + grp->rxtx_bytes += bytes_diff; + spin_unlock(&grp->hdr.lock); + } + + spin_unlock_irq(&tsk->rship->net_lock); +} + +static void task_net_relationship_work(struct callback_head *work) +{ + struct net_relationship_callback *ncb; + struct task_struct *curr = current; + struct net_relationship_req req; + struct task_struct *src; + + ncb = container_of(work, struct net_relationship_callback, twork); + req = ncb->req; + atomic_set(&ncb->active, 0); + + rcu_read_lock(); + src = find_task_by_pid_ns(ncb->src_pid, &init_pid_ns); + if (!src) { + rcu_read_unlock(); + return; + } + + if (!task_relationship_supported(src) || + !task_relationship_supported(curr)) { + rcu_read_unlock(); + return; + } + + /* prevent src going away */ + get_task_struct(src); + + rcu_read_unlock(); + + /* build net relationship */ + task_net_group(src, curr); + + if (time_after(jiffies, curr->rship->rxtx_update_next)) + task_rxtx_data_update(curr); + + if (time_after(jiffies, src->rship->rxtx_update_next)) + task_rxtx_data_update(src); + + double_lock_irq(&src->rship->net_lock, &curr->rship->net_lock); + curr->rship->rxtx_buffer += req.rxtx_bytes; + src->rship->rxtx_buffer += req.rxtx_bytes; + spin_unlock(&src->rship->net_lock); + spin_unlock_irq(&curr->rship->net_lock); + + put_task_struct(src); +} + +static int cmp_fault_stats(const void *a, const void *b) +{ 
+ return ((struct fault_array_info *)b)->val - + ((struct fault_array_info *)a)->val; +} + +void numa_faults_update_and_sort(int nid, int new, + struct fault_array_info *stats) +{ + int nodes, i; + + if (!task_relationship_used()) + return; + + if (nid == first_online_node) { + for (i = 0; i < FAULT_NODES_MAX; i++) { + stats[i].nid = -1; + stats[i].val = 0; + } + } + + nodes = min(FAULT_NODES_MAX, num_online_nodes()); + if (new <= stats[nodes - 1].val) + return; + + stats[nodes - 1].nid = nid; + stats[nodes - 1].val = new; + sort(stats, nodes, sizeof(stats[0]), cmp_fault_stats, NULL); +} + +void sched_get_relationship(struct task_struct *tsk, + struct bpf_relationship_get_args *args) +{ + struct net_group *ngrp; + + rcu_read_lock(); + + /* memory relationship */ + sched_get_mm_relationship(tsk, args); + + /* net relationship */ + ngrp = rcu_dereference(tsk->rship->net_group); + if (ngrp) { + args->net.comm.gid = ngrp->hdr.gid; + args->net.comm.nr_tasks = ngrp->hdr.nr_tasks; + args->net.comm.preferred_node = ngrp->hdr.preferred_nid; + args->net.grp_rxtx_bytes = ngrp->rxtx_bytes; + } + + rcu_read_unlock(); +} + +void sctl_sched_get_net_relationship(struct task_struct *tsk, + struct sctl_net_relationship_info *info) +{ + struct task_relationship *rship = tsk->rship; + struct net_group *grp; + + memset(info, 0, sizeof(*info)); + info->valid = true; + info->nic_nid = rship->nic_nid; + info->rx_dev_idx = rship->rx_dev_idx; + info->rx_dev_queue_idx = rship->rx_dev_queue_idx; + info->rx_dev_netns_cookie = rship->rx_dev_netns_cookie; + info->rxtx_remote_bytes = rship->rxtx_remote_bytes; + info->rxtx_bytes = rship->rxtx_bytes; + + info->grp_hdr.gid = NO_RSHIP; + + rcu_read_lock(); + + grp = rcu_dereference(rship->net_group); + if (grp) { + info->grp_hdr.gid = grp->hdr.gid; + info->grp_hdr.nr_tasks = grp->hdr.nr_tasks; + snprintf(info->grp_hdr.preferred_nid, SCTL_STR_MAX, "%*pbl", + nodemask_pr_args(&grp->hdr.preferred_nid)); + info->grp_rxtx_bytes = grp->rxtx_bytes; + } + + rcu_read_unlock(); +} + +void task_relationship_free(struct task_struct *tsk, bool reset) +{ + if (!task_relationship_used()) + return; + + put_task_net_group(tsk, reset); +} + +int sched_relationship_fork(struct task_struct *p) +{ + int i; + + p->rship = kzalloc(sizeof(struct task_relationship), GFP_KERNEL); + if (!p->rship) + return -ENOMEM; + + for (i = 0; i < FAULT_NODES_MAX; i++) + p->rship->faults.faults_ordered[i].nid = -1; + + p->rship->nic_nid = -1; + p->rship->rx_dev_idx = -1; + p->rship->rx_dev_queue_idx = -1; + + spin_lock_init(&p->rship->net_lock); + init_task_work(&p->rship->cb.twork, task_net_relationship_work); +#ifdef CONFIG_NUMA_BALANCING + p->rship->node_work.next = &p->rship->node_work; + init_task_work(&p->rship->node_work, task_preferred_node_work); +#endif + return 0; +} + +void sched_relationship_free(struct task_struct *p) +{ + kfree(p->rship); + p->rship = NULL; +} diff --git a/kernel/sched/relationship_ioctl.c b/kernel/sched/relationship_ioctl.c new file mode 100644 index 0000000000000000000000000000000000000000..229786961ec86ef2de01646a737b5c16bcde3dad --- /dev/null +++ b/kernel/sched/relationship_ioctl.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for support ioctl for schedluler + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Hui Tang + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software 
Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include +#include +#include +#include + +#include "sched.h" + +static int sched_ctl_open(struct inode *inode, struct file *filp) +{ + filp->private_data = NULL; + + return 0; +} + +static int sched_ctl_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int sched_ctrl_get_relationship(void __user *arg) +{ + struct sctl_get_relationship_args data; + struct task_struct *tsk; + pid_t pid; + + if (!task_relationship_used()) { + pr_err("task relationship disabled!\n"); + return -EPERM; + } + + if (copy_from_user(&data, arg, sizeof(data))) { + pr_err("fail to copy_from_user!\n"); + return -EFAULT; + } + + pid = data.tid; + + rcu_read_lock(); + + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + + if (!task_relationship_supported(tsk)) { + rcu_read_unlock(); + return -EPERM; + } + + sctl_sched_get_net_relationship(tsk, &data.nrsi); + sctl_sched_get_mem_relationship(tsk, &data.mrsi); + + rcu_read_unlock(); + + if (copy_to_user(arg, &data, sizeof(data))) { + pr_err("fail to copy_to_user!\n"); + return -EFAULT; + } + + return 0; +} + +static long sched_ctl_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int ret = 0; + struct sched_ctl_data *data; + + if (_IOC_TYPE(cmd) != SCTL_IOC_MAGIC) + return -ENOTTY; + + if (_IOC_NR(cmd) > SCTL_IOC_MAXNR) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + data = filp->private_data; + + switch (cmd) { + case SCTL_GET_RSHIP: + ret = sched_ctrl_get_relationship((void __user *)(uintptr_t)arg); + break; + default: + ret = -EINVAL; + + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static long +sched_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)(uintptr_t)compat_ptr(arg); + return sched_ctl_ioctl(file, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static const struct file_operations sched_ctl_fops = { + .open = sched_ctl_open, + .release = sched_ctl_release, + .llseek = no_llseek, + .unlocked_ioctl = sched_ctl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sched_ctl_compat_ioctl, +#endif +}; + +static struct miscdevice sched_ctl_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "relationship_ctrl", + .fops = &sched_ctl_fops, +}; + +static int __init sched_ctl_device_init(void) +{ + return misc_register(&sched_ctl_device); +}; + +device_initcall(sched_ctl_device_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e6f934af7062f938c5a67f9b10a77d034e91868d..3b2fc472908a161c28f9fd327ca31e5d575f2715 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -76,6 +76,8 @@ #include "cpupri.h" #include "cpudeadline.h" +#include "numa_icon.h" +#include #include diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index fc51d6f0d447e5d81b1a9c42fca9a11c186a8fca..3afc3e354844b59d7f70daab890813b2623f54a6 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -444,6 +444,12 @@ class PrinterHelpers(Printer): 'struct cpumask_op_args', 'struct sched_migrate_ctx', 'struct sched_affine_ctx', + 'struct sched_migrate_node', + 'struct nodemask_op_args', + 'struct bpf_relationship_get_args', + 'struct bpf_relationship_set_args', + 'struct sched_preferred_node_ctx', + 'struct bpf_node_stats', ] 
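
A note on the relationship_ctrl interface added above in kernel/sched/relationship_ioctl.c: the misc device is driven from user space with the SCTL_GET_RSHIP ioctl. The sketch below is illustrative only; the header name <linux/sched_ctl.h>, the printed field types and the function name query_task_relationship() are assumptions of this example, while SCTL_GET_RSHIP, the /dev/relationship_ctrl node (derived from the misc device name), the .tid/.nrsi/.mrsi members and the CAP_SYS_ADMIN requirement come from the patch itself.

/* Hypothetical user-space caller for the relationship_ctrl ioctl. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/sched_ctl.h>	/* assumed uapi header exporting SCTL_GET_RSHIP */

static int query_task_relationship(pid_t tid)
{
	struct sctl_get_relationship_args args;
	int fd, ret;

	memset(&args, 0, sizeof(args));
	args.tid = tid;			/* thread to query */

	fd = open("/dev/relationship_ctrl", O_RDWR);
	if (fd < 0)
		return -1;

	/* Requires CAP_SYS_ADMIN, see sched_ctl_ioctl() above. */
	ret = ioctl(fd, SCTL_GET_RSHIP, &args);
	if (!ret)
		printf("tid %d: net group %d, rxtx_bytes %lu\n", (int)tid,
		       args.nrsi.grp_hdr.gid,
		       (unsigned long)args.nrsi.rxtx_bytes);

	close(fd);
	return ret;
}
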
     known_types = {
             '...',
@@ -496,6 +502,12 @@ class PrinterHelpers(Printer):
             'struct cpumask_op_args',
             'struct sched_migrate_ctx',
             'struct sched_affine_ctx',
+            'struct sched_migrate_node',
+            'struct nodemask_op_args',
+            'struct bpf_relationship_get_args',
+            'struct bpf_relationship_set_args',
+            'struct sched_preferred_node_ctx',
+            'struct bpf_node_stats',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e241f8d4becd8553344c5cbb679ee3a6667b122a..5a153a1a8f18a4758864366630384d84b48b1eb0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3874,6 +3874,52 @@ union bpf_attr {
  *		check src_cpu whether share cache with dst_cpu.
  *	Return
  *		true yes, false no.
+ *
+ * int bpf_nodemask_op(struct nodemask_op_args *op, int len)
+ *	Description
+ *		A set of nodemask-related operations. The operation is
+ *		selected by *op*->op_type; the other *op* fields must be
+ *		filled in according to that type. *op*->op_type is one of:
+ *
+ *		**NODEMASK_EMPTY**
+ *			nodes_empty(op->arg1) returned.
+ *		**NODEMASK_NODE_ISSET**
+ *			node_isset(op->arg1, op->arg2) returned.
+ *		**NODEMASK_NODES_CLEAR**
+ *			clear all nodes in op->arg1, 0 returned.
+ *		**NODEMASK_NODE_CLEAR**
+ *			clear node op->arg1 in op->arg2, 0 returned.
+ *		**NODEMASK_NODE_SET**
+ *			set node op->arg1 in op->arg2, 0 returned.
+ *		**NODEMASK_WEIGHT**
+ *			nodes_weight(op->arg1) returned.
+ *		**NODEMASK_NODELIST_PARSE**
+ *			parse the node list string *op*->arg1 into nodemask_t *op*->arg2,
+ *			0 on success, or a negative error in case of failure.
+ *		**NODEMASK_TO_CPUMASK**
+ *			convert nodemask_t *op*->arg1 to cpumask_t *op*->arg2, 0 returned.
+ *		**NODEMASK_ONLINE**
+ *			copy the online node mask into nodemask_t *op*->arg1, 0 returned.
+ *	Return
+ *		See the description of each operation above.
+ *
+ * int bpf_get_task_relationship_stats(struct task_struct *tsk, struct bpf_map *map, struct bpf_relationship_get_args *stats)
+ *	Description
+ *		Get the relationship statistics of *tsk* and store them in *stats*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sched_set_curr_preferred_node(struct bpf_relationship_set_args *args, int len)
+ *	Description
+ *		Set the preferred node of the current task.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_node_stats(int nid, struct bpf_node_stats *ctx, int len)
+ *	Description
+ *		Get the resource statistics of node *nid* and store them in *ctx*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4046,6 +4092,10 @@ union bpf_attr { FN(sched_entity_to_tg), \ FN(cpumask_op), \ FN(cpus_share_cache), \ + FN(nodemask_op), \ + FN(get_task_relationship_stats),\ + FN(sched_set_curr_preferred_node),\ + FN(get_node_stats), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h index 04b43c145fcd4d720260104294465a9634b49c82..3e9b41788637b468e4657fcd5b7b8af2eab05cec 100644 --- a/tools/lib/bpf/libbpf_sched.h +++ b/tools/lib/bpf/libbpf_sched.h @@ -16,6 +16,8 @@ #define __LIBBPF_LIBSCHED_H #include +#include +#include #include #include #include @@ -26,7 +28,7 @@ #define INVALID_PTR ((void *)(0UL)) #define getVal(P) \ ({ \ - typeof(P) val = 0; \ + typeof(P) val; \ bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ val; \ }) @@ -78,6 +80,119 @@ struct { __uint(max_entries, 1); } map_cpumask_info SEC(".maps"); +static struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct bpf_relationship_get_args); + __uint(max_entries, 1); +} map_rship_stats SEC(".maps"); + +static __always_inline void +libbpf_nodes_and(nodemask_t *dst, nodemask_t *src1, nodemask_t *src2) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODES_AND; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline void +libbpf_nodes_andnot(nodemask_t *dst, nodemask_t *src1, nodemask_t *src2) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODES_ANDNOT; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline void +libbpf_nodes_or(nodemask_t *dst, nodemask_t *src1, nodemask_t *src2) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODES_OR; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline void libbpf_node_set(int nid, + nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODE_SET; + op.arg1 = &nid; + op.arg2 = nodes; + op.arg3 = INVALID_PTR; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline void libbpf_node_clear(int nid, + nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODE_CLEAR; + op.arg1 = &nid; + op.arg2 = nodes; + op.arg3 = INVALID_PTR; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_node_isset(int nid, + nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODE_ISSET; + op.arg1 = &nid; + op.arg2 = nodes; + op.arg3 = INVALID_PTR; + return bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_nodemask_empty(nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_EMPTY; + op.arg1 = nodes; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + return bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_nodemask_to_cpumask(nodemask_t *nodes, + struct cpumask *cpus) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_TO_CPUMASK; + op.arg1 = nodes; + op.arg2 = cpus; + op.arg3 = INVALID_PTR; + return bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_nodes_online(nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_ONLINE; + op.arg1 = nodes; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + return bpf_nodemask_op(&op, sizeof(op)); +} + 
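
The nodemask wrappers above compose naturally. As a brief sketch (not part of the patch; the helper name is invented for this example), a scheduler BPF program could narrow a preferred-node mask to the nodes that are currently online and convert the result into a cpumask:

static __always_inline long
libbpf_preferred_nodes_to_cpumask(nodemask_t *preferred, struct cpumask *cpus)
{
	nodemask_t online = {};
	nodemask_t allowed = {};

	/* allowed = preferred & online */
	libbpf_nodes_online(&online);
	libbpf_nodes_and(&allowed, preferred, &online);

	if (libbpf_nodemask_empty(&allowed))
		return -1;

	return libbpf_nodemask_to_cpumask(&allowed, cpus);
}

The resulting cpumask can then be handed to the existing cpumask helpers in this header, for example libbpf_cpumask_copy() just below.
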
static __always_inline long libbpf_cpumask_copy(struct cpumask *dst, struct cpumask *src) { @@ -507,4 +622,47 @@ static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se) return se_tag; } + +static __always_inline unsigned long libbpf_node_cfs_util_of(int nid) +{ + struct bpf_node_stats stats = {0}; + + bpf_get_node_stats(nid, &stats, sizeof(stats)); + return getVal(stats.util); +} + +static __always_inline unsigned long libbpf_node_cfs_capacity_of(int nid) +{ + struct bpf_node_stats stats = {0}; + + bpf_get_node_stats(nid, &stats, sizeof(stats)); + return getVal(stats.compute_capacity); +} + +static __always_inline unsigned int libbpf_node_weight_of(int nid) +{ + struct bpf_node_stats stats = {0}; + + bpf_get_node_stats(nid, &stats, sizeof(stats)); + return getVal(stats.weight); +} + +static __always_inline int +libbpf_mem_preferred_nid(struct task_struct *tsk, nodemask_t *preferred_node) +{ + struct bpf_relationship_get_args *stats; + int key = 0; + int ret; + + stats = bpf_map_lookup_elem(&map_rship_stats, &key); + if (!stats) + return NUMA_NO_NODE; + + ret = bpf_get_task_relationship_stats(tsk, &map_rship_stats, stats); + if (ret) + return NUMA_NO_NODE; + + *preferred_node = getVal(stats->mm.comm.preferred_node); + return 0; +} #endif
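
Finally, the node-statistics and relationship helpers are meant to be used together from a scheduler BPF program. The sketch below is illustrative only: libbpf_pick_preferred_node() and the NR_SCAN_NODES bound are inventions of this example (kept to a small constant so the bounded loop stays verifier-friendly), while libbpf_mem_preferred_nid(), libbpf_node_isset(), libbpf_node_cfs_util_of() and libbpf_node_cfs_capacity_of() are the helpers defined above.

#define NR_SCAN_NODES	8	/* assumed upper bound on NUMA node ids to scan */

/* Pick the preferred node with the most spare CFS capacity. */
static __always_inline int
libbpf_pick_preferred_node(struct task_struct *tsk)
{
	nodemask_t preferred = {};
	unsigned long util, cap, spare, best_spare = 0;
	int nid, best_nid = NUMA_NO_NODE;

	/* libbpf_mem_preferred_nid() returns 0 once *preferred is filled in. */
	if (libbpf_mem_preferred_nid(tsk, &preferred))
		return NUMA_NO_NODE;

	for (nid = 0; nid < NR_SCAN_NODES; nid++) {
		if (!libbpf_node_isset(nid, &preferred))
			continue;

		util = libbpf_node_cfs_util_of(nid);
		cap = libbpf_node_cfs_capacity_of(nid);
		spare = cap > util ? cap - util : 0;

		if (best_nid == NUMA_NO_NODE || spare > best_spare) {
			best_spare = spare;
			best_nid = nid;
		}
	}

	return best_nid;
}

When the program runs on behalf of the current task, the chosen node can be written back by setting it in a struct bpf_relationship_set_args via libbpf_node_set() and calling bpf_sched_set_curr_preferred_node(&args, sizeof(args)), as documented in the uapi comment above.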