diff --git a/0001-io_uring-io-wq-stop-setting-PF_NO_SETAFFINITY-on-io-.patch b/0001-io_uring-io-wq-stop-setting-PF_NO_SETAFFINITY-on-io-.patch new file mode 100644 index 0000000000000000000000000000000000000000..29f02709d2d980e7ef7920da685b08952ab8147b --- /dev/null +++ b/0001-io_uring-io-wq-stop-setting-PF_NO_SETAFFINITY-on-io-.patch @@ -0,0 +1,81 @@ +From c226e8f63fd32b161bba9b37ed525b8228b47b08 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Wed, 8 Mar 2023 07:18:51 -0700 +Subject: [PATCH] io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq + workers + +commit 01e68ce08a30db3d842ce7a55f7f6e0474a55f9a + +Every now and then reports come in that are puzzled on why changing +affinity on the io-wq workers fails with EINVAL. This happens because they +set PF_NO_SETAFFINITY as part of their creation, as io-wq organizes +workers into groups based on what CPU they are running on. + +However, this is purely an optimization and not a functional requirement. +We can allow setting affinity, and just lazily update our worker to wqe +mappings. If a given io-wq thread times out, it normally exits if there's +no more work to do. The exception is if it's the last worker available. +For the timeout case, check the affinity of the worker against group mask +and exit even if it's the last worker. New workers should be created with +the right mask and in the right location. + +Reported-by :Daniel Dao +Link: https://lore.kernel.org/io-uring/CA+wXwBQwgxB3_UphSny-yAP5b26meeOu1W4TwYVcD_+5gOhvPw@mail.gmail.com/ +Signed-off-by: Jens Axboe +Signed-off-by: Li Lingfeng +Signed-off-by: Yue Haibing +--- + io_uring/io-wq.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c +index 3970e8047..adc39385e 100644 +--- a/io_uring/io-wq.c ++++ b/io_uring/io-wq.c +@@ -628,7 +628,7 @@ static int io_wqe_worker(void *data) + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; +- bool last_timeout = false; ++ bool exit_mask = false, last_timeout = false; + char buf[TASK_COMM_LEN]; + + worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); +@@ -644,8 +644,11 @@ static int io_wqe_worker(void *data) + io_worker_handle_work(worker); + + raw_spin_lock(&wqe->lock); +- /* timed out, exit unless we're the last worker */ +- if (last_timeout && acct->nr_workers > 1) { ++ /* ++ * Last sleep timed out. Exit if we're not the last worker, ++ * or if someone modified our affinity. 
++ */ ++ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { + acct->nr_workers--; + raw_spin_unlock(&wqe->lock); + __set_current_state(TASK_RUNNING); +@@ -664,7 +667,11 @@ static int io_wqe_worker(void *data) + continue; + break; + } +- last_timeout = !ret; ++ if (!ret) { ++ last_timeout = true; ++ exit_mask = !cpumask_test_cpu(raw_smp_processor_id(), ++ wqe->cpu_mask); ++ } + } + + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) +@@ -716,7 +723,6 @@ static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, + tsk->worker_private = worker; + worker->task = tsk; + set_cpus_allowed_ptr(tsk, wqe->cpu_mask); +- tsk->flags |= PF_NO_SETAFFINITY; + + raw_spin_lock(&wqe->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); +-- +2.20.1 + diff --git a/0002-io_uring-sqpoll-Do-not-set-PF_NO_SETAFFINITY-on-sqpo.patch b/0002-io_uring-sqpoll-Do-not-set-PF_NO_SETAFFINITY-on-sqpo.patch new file mode 100644 index 0000000000000000000000000000000000000000..363c7d523daf75cbf7d1bc85d5447ebd1382f7b3 --- /dev/null +++ b/0002-io_uring-sqpoll-Do-not-set-PF_NO_SETAFFINITY-on-sqpo.patch @@ -0,0 +1,51 @@ +From 00217e9a015b1d2fea9deccd0311be4758cc0d7e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Michal=20Koutn=C3=BD?= +Date: Mon, 26 May 2025 14:38:04 +0800 +Subject: [PATCH] io_uring/sqpoll: Do not set PF_NO_SETAFFINITY on sqpoll + threads + +commit a5fc1441af7719e93dc7a638a960befb694ade89 + +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.3-rc3 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IC6ES1 + +-------------------------------- + +Users may specify a CPU where the sqpoll thread would run. This may +conflict with cpuset operations because of strict PF_NO_SETAFFINITY +requirement. That flag is unnecessary for polling "kernel" threads, see +the reasoning in commit 01e68ce08a30 ("io_uring/io-wq: stop setting +PF_NO_SETAFFINITY on io-wq workers"). Drop the flag on poll threads too. 
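(Illustration only, not part of the patch series.) The user-visible effect of patches 0001 and 0002 is that pinning an io-wq worker or the sqpoll kernel thread from userspace now succeeds instead of failing with EINVAL. A minimal sketch, assuming the thread's TID has already been located (for example under /proc/<pid>/task/ by comm name such as "iou-wrk-*" or "iou-sqp-*"); the TID and target CPU are placeholders:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

/*
 * Pin one io_uring kernel worker thread to the given CPU.  Before these
 * patches the kernel rejected this with EINVAL because the thread carried
 * PF_NO_SETAFFINITY; with the flag dropped the call is expected to succeed.
 */
static int pin_worker(pid_t worker_tid, int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(worker_tid, sizeof(set), &set)) {
		perror("sched_setaffinity");
		return -1;
	}
	return 0;
}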
+ +Fixes: 01e68ce08a30 ("io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers") +Link: https://lore.kernel.org/all/20230314162559.pnyxdllzgw7jozgx@blackpad/ +Signed-off-by: Michal Koutný +Link: https://lore.kernel.org/r/20230314183332.25834-1-mkoutny@suse.com +Signed-off-by: Jens Axboe +Signed-off-by: Li Lingfeng +Signed-off-by: Yue Haibing +--- + io_uring/sqpoll.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c +index a0bb6142a..d152108c5 100644 +--- a/io_uring/sqpoll.c ++++ b/io_uring/sqpoll.c +@@ -239,7 +239,6 @@ static int io_sq_thread(void *data) + set_cpus_allowed_ptr(current, cpu_online_mask); + sqd->sq_cpu = raw_smp_processor_id(); + } +- current->flags |= PF_NO_SETAFFINITY; + + /* + * Force audit context to get setup, in case we do prep side async +-- +2.20.1 + diff --git a/0003-io_uring-Support-forcing-sq-thread-to-be-idle-and-wo.patch b/0003-io_uring-Support-forcing-sq-thread-to-be-idle-and-wo.patch new file mode 100644 index 0000000000000000000000000000000000000000..f33721519a710b66f63393e1bbc21d53f9d7f09f --- /dev/null +++ b/0003-io_uring-Support-forcing-sq-thread-to-be-idle-and-wo.patch @@ -0,0 +1,296 @@ +From a2505e287bd260f975786e72e56f22165ae25fe5 Mon Sep 17 00:00:00 2001 +From: ChenZhen +Date: Fri, 11 Jul 2025 11:22:11 +0800 +Subject: [PATCH] io_uring: Support forcing sq thread to be idle and woken up + by hrtimer + +This patch adds one option IORING_SETUP_SQ_THREAD_IDLE for io_uring user +program to reduce cpu usage of sq thread. +When enabled, sq polling thread will try to be idle. a hrtimer will be +created to wake up the sq thread periodically, the period can be set by +io_uring_params.sq_thread_wakeup_period(unit: us, default 10ms). + +Signed-off-by: ChenZhen +--- + include/linux/io_uring_types.h | 2 ++ + include/uapi/linux/io_uring.h | 18 ++++++++++ + io_uring/io_uring.c | 24 +++++++++++-- + io_uring/sqpoll.c | 63 ++++++++++++++++++++++++++++++++-- + io_uring/sqpoll.h | 5 ++- + 5 files changed, 105 insertions(+), 7 deletions(-) + +diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h +index 37aeea266ebb..93a68a50405d 100644 +--- a/include/linux/io_uring_types.h ++++ b/include/linux/io_uring_types.h +@@ -197,6 +197,7 @@ struct io_ring_ctx { + + struct io_rings *rings; + unsigned int flags; ++ unsigned int ext_flags; + enum task_work_notify_mode notify_method; + unsigned int compat: 1; + unsigned int drain_next: 1; +@@ -350,6 +351,7 @@ struct io_ring_ctx { + + struct list_head defer_list; + unsigned sq_thread_idle; ++ ktime_t sq_thread_wakeup_period; + /* protected by ->completion_lock */ + unsigned evfd_last_cq_tail; + }; +diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h +index 434f62e0fb72..e011a5f16481 100644 +--- a/include/uapi/linux/io_uring.h ++++ b/include/uapi/linux/io_uring.h +@@ -172,6 +172,13 @@ enum { + * try to do it just before it is needed. 
+ */ + #define IORING_SETUP_DEFER_TASKRUN (1U << 13) ++#define IORING_SETUP_EXT_PARAM (1U << 31) /* extended param */ ++ ++/* ++ * io_uring_setup() extended flags ++ */ ++/* Force SQ thread to be idle, waiting for periodic wake-up */ ++#define IORING_SETUP_SQ_THREAD_FORCE_IDLE (1U << 0) + + enum io_uring_op { + IORING_OP_NOP, +@@ -454,6 +461,17 @@ struct io_uring_params { + struct io_cqring_offsets cq_off; + }; + ++struct io_uring_params_ext { ++ __u32 flags; ++ __u32 sq_thread_wakeup_period; ++ __u32 resv[6]; ++}; ++ ++struct io_uring_params_full { ++ struct io_uring_params p; ++ struct io_uring_params_ext ext_p; ++}; ++ + /* + * io_uring_params->features flags + */ +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 353c35987b06..ce7b5f17e4cf 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -3455,6 +3455,7 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) + } + + static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ++ struct io_uring_params_ext *ext_p, + struct io_uring_params __user *params) + { + struct io_ring_ctx *ctx; +@@ -3512,6 +3513,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + !(ctx->flags & IORING_SETUP_SQPOLL)) + ctx->syscall_iopoll = 1; + ++ ctx->ext_flags = ext_p->flags; + ctx->compat = in_compat_syscall(); + if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) + ctx->user = get_uid(current_user()); +@@ -3560,7 +3562,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + if (ret) + goto err; + +- ret = io_sq_offload_create(ctx, p); ++ ret = io_sq_offload_create(ctx, p, ext_p); + if (ret) + goto err; + /* always set a rsrc node */ +@@ -3636,6 +3638,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + static long io_uring_setup(u32 entries, struct io_uring_params __user *params) + { + struct io_uring_params p; ++ struct io_uring_params_ext ext_p = {}; + int i; + + if (copy_from_user(&p, params, sizeof(p))) +@@ -3651,10 +3654,25 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | +- IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) ++ IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | ++ IORING_SETUP_EXT_PARAM)) + return -EINVAL; + +- return io_uring_create(entries, &p, params); ++ if (p.flags & IORING_SETUP_EXT_PARAM) { ++ if (copy_from_user(&ext_p, (void __user *)params + ++ offsetof(struct io_uring_params_full, ext_p), ++ sizeof(ext_p))) ++ return -EFAULT; ++ for (i = 0; i < ARRAY_SIZE(ext_p.resv); i++) { ++ if (ext_p.resv[i]) ++ return -EINVAL; ++ } ++ ++ if (ext_p.flags & ~(IORING_SETUP_SQ_THREAD_FORCE_IDLE)) ++ return -EINVAL; ++ } ++ ++ return io_uring_create(entries, &p, &ext_p, params); + } + + SYSCALL_DEFINE2(io_uring_setup, u32, entries, +diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c +index a0bb6142afbb..049fee9ae8da 100644 +--- a/io_uring/sqpoll.c ++++ b/io_uring/sqpoll.c +@@ -84,6 +84,25 @@ static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) + sqd->sq_thread_idle = sq_thread_idle; + } + ++static __cold void io_sqd_update_wakeup_period(struct io_sq_data *sqd) ++{ ++ struct io_ring_ctx *ctx; ++ ktime_t sq_thread_wakeup_period = 0; ++ ++ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { ++ if (!(ctx->ext_flags & IORING_SETUP_SQ_THREAD_FORCE_IDLE)) ++ continue; ++ ++ if 
(!sq_thread_wakeup_period) { ++ sq_thread_wakeup_period = ctx->sq_thread_wakeup_period; ++ continue; ++ } ++ sq_thread_wakeup_period = min(sq_thread_wakeup_period, ++ ctx->sq_thread_wakeup_period); ++ } ++ WRITE_ONCE(sqd->sq_thread_wakeup_period, sq_thread_wakeup_period); ++} ++ + void io_sq_thread_finish(struct io_ring_ctx *ctx) + { + struct io_sq_data *sqd = ctx->sq_data; +@@ -92,6 +111,9 @@ void io_sq_thread_finish(struct io_ring_ctx *ctx) + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); ++ io_sqd_update_wakeup_period(sqd); ++ if (!sqd->sq_thread_wakeup_period) ++ hrtimer_cancel(&sqd->timer); + io_sq_thread_unpark(sqd); + + io_put_sq_data(sqd); +@@ -156,6 +178,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, + mutex_init(&sqd->lock); + init_waitqueue_head(&sqd->wait); + init_completion(&sqd->exited); ++ hrtimer_init(&sqd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + return sqd; + } + +@@ -251,7 +274,7 @@ static int io_sq_thread(void *data) + + mutex_lock(&sqd->lock); + while (1) { +- bool cap_entries, sqt_spin = false; ++ bool cap_entries, sqt_spin = false, force_idle = false; + + if (io_sqd_events_pending(sqd) || signal_pending(current)) { + if (io_sqd_handle_event(sqd)) +@@ -263,13 +286,18 @@ static int io_sq_thread(void *data) + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + int ret = __io_sq_thread(ctx, cap_entries); + ++ if (ctx->ext_flags & IORING_SETUP_SQ_THREAD_FORCE_IDLE) { ++ force_idle = true; ++ continue; ++ } ++ + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) + sqt_spin = true; + } + if (io_run_task_work()) + sqt_spin = true; + +- if (sqt_spin || !time_after(jiffies, timeout)) { ++ if (!force_idle && (sqt_spin || !time_after(jiffies, timeout))) { + if (sqt_spin) + timeout = jiffies + sqd->sq_thread_idle; + if (unlikely(need_resched())) { +@@ -350,8 +378,21 @@ int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) + return 0; + } + ++static enum hrtimer_restart sq_thread_hrtimer_fn(struct hrtimer *timer) ++{ ++ struct io_sq_data *sqd = container_of(timer, struct io_sq_data, timer); ++ ktime_t sq_thread_wakeup_period; ++ ++ sq_thread_wakeup_period = READ_ONCE(sqd->sq_thread_wakeup_period); ++ wake_up(&sqd->wait); ++ hrtimer_forward_now(timer, sq_thread_wakeup_period); ++ ++ return HRTIMER_RESTART; ++} ++ + __cold int io_sq_offload_create(struct io_ring_ctx *ctx, +- struct io_uring_params *p) ++ struct io_uring_params *p, ++ struct io_uring_params_ext *ext_p) + { + int ret; + +@@ -390,9 +431,19 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + ++ if (ctx->ext_flags & IORING_SETUP_SQ_THREAD_FORCE_IDLE) { ++ /* reset to make this ctx skip updating sq thread idle */ ++ ctx->sq_thread_idle = 0; ++ ctx->sq_thread_wakeup_period = ++ ns_to_ktime((u64)ext_p->sq_thread_wakeup_period * NSEC_PER_USEC); ++ if (!ctx->sq_thread_wakeup_period) ++ ctx->sq_thread_wakeup_period = ns_to_ktime(10 * NSEC_PER_MSEC); ++ } ++ + io_sq_thread_park(sqd); + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); ++ io_sqd_update_wakeup_period(sqd); + /* don't attach to a dying SQPOLL thread, would be racy */ + ret = (attached && !sqd->thread) ? 
-ENXIO : 0; + io_sq_thread_unpark(sqd); +@@ -426,6 +477,12 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, + wake_up_new_task(tsk); + if (ret) + goto err; ++ ++ if (ctx->ext_flags & IORING_SETUP_SQ_THREAD_FORCE_IDLE) { ++ sqd->timer.function = sq_thread_hrtimer_fn; ++ hrtimer_start(&sqd->timer, READ_ONCE(sqd->sq_thread_wakeup_period), ++ HRTIMER_MODE_REL); ++ } + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ + ret = -EINVAL; +diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h +index 36245f1afa5e..f8bbbc336ae6 100644 +--- a/io_uring/sqpoll.h ++++ b/io_uring/sqpoll.h +@@ -18,9 +18,12 @@ struct io_sq_data { + + unsigned long state; + struct completion exited; ++ ktime_t sq_thread_wakeup_period; ++ struct hrtimer timer; + }; + +-int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p); ++int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p, ++ struct io_uring_params_ext *ext_p); + void io_sq_thread_finish(struct io_ring_ctx *ctx); + void io_sq_thread_stop(struct io_sq_data *sqd); + void io_sq_thread_park(struct io_sq_data *sqd); +-- +2.33.0 + diff --git a/0004-sched-support-soft-domain.patch b/0004-sched-support-soft-domain.patch new file mode 100644 index 0000000000000000000000000000000000000000..7302e017a65732c3310d6fd4adc5ac9fe8a24355 --- /dev/null +++ b/0004-sched-support-soft-domain.patch @@ -0,0 +1,1086 @@ +From 168595cbb0852fc92e18e960309da88bddd40a3a Mon Sep 17 00:00:00 2001 +From: Zhang Qiao +Date: Mon, 30 Jun 2025 15:10:48 +0800 +Subject: [PATCH] sched: support soft domain + +On Kunpeng server, each LLC domain contains multiple clusters. When +multiple services are deployed within the same LLC domain, their tasks +become distributed across all clusters. This results in: + +1. High cache synchronization overheadbetween different tasks of the + same service. +2. Severe cache contention among tasks from different services. + +The Soft Domain architecture partitions resources by clusters. Under +low-load conditions, each service operates exclusively within its +dedicated domain to prevent cross-service interference, thereby +enhancing both CPU isolation and improving cache locality. 
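(Illustration only, not part of the patch itself.) The interface added below is per task group on the cgroup v1 cpu controller. Judging from sched_group_set_soft_domain() and find_idlest_llc() further down, writing -1 lets the scheduler pick the idlest LLC, a positive value N selects NUMA node N-1, and 0 detaches the group again; cpu.soft_domain_nr_cpu must be written while the group is still detached, and cpu.soft_domain_cpu_list can then be read back to see which clusters were assigned. A rough usage sketch, where the mount point /sys/fs/cgroup/cpu and the group name "svcA" are assumptions of the example:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int cg_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	/*
	 * Size the soft domain first: the quota write is rejected once the
	 * group is already attached (policy != 0).
	 */
	cg_write("/sys/fs/cgroup/cpu/svcA/cpu.soft_domain_nr_cpu", "8");

	/*
	 * -1: let the kernel pick the idlest LLC and carve clusters out of
	 * it; a positive value would instead select that NUMA node, and 0
	 * would detach the group again.
	 */
	cg_write("/sys/fs/cgroup/cpu/svcA/cpu.soft_domain", "-1");
	return 0;
}

Note that the SOFT_DOMAIN scheduler feature defaults to off in features.h, so the placement logic in fair.c only engages once that feature is turned on (the sched_soft_domain= boot switch, in contrast, defaults to enabled).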
+ +Signed-off-by: Zhang Qiao +--- + config.aarch64 | 2 + + config.aarch64-64k | 2 + + config.x86_64 | 1 + + include/linux/sched/topology.h | 21 ++ + init/Kconfig | 12 + + kernel/sched/Makefile | 1 + + kernel/sched/core.c | 86 ++++++ + kernel/sched/fair.c | 146 +++++++++- + kernel/sched/features.h | 4 + + kernel/sched/sched.h | 41 +++ + kernel/sched/soft_domain.c | 500 +++++++++++++++++++++++++++++++++ + 11 files changed, 815 insertions(+), 1 deletion(-) + create mode 100644 kernel/sched/soft_domain.c + +diff --git a/config.aarch64 b/config.aarch64 +index fcff88edb..51b199bc0 100644 +--- a/config.aarch64 ++++ b/config.aarch64 +@@ -150,6 +150,7 @@ CONFIG_CGROUP_SCHED=y + CONFIG_FAIR_GROUP_SCHED=y + CONFIG_CFS_BANDWIDTH=y + # CONFIG_RT_GROUP_SCHED is not set ++CONFIG_SCHED_SOFT_DOMAIN=y + CONFIG_CGROUP_PIDS=y + CONFIG_CGROUP_RDMA=y + CONFIG_CGROUP_FREEZER=y +@@ -383,6 +384,7 @@ CONFIG_ARM64_PA_BITS=48 + # CONFIG_CPU_BIG_ENDIAN is not set + CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_SCHED_MC=y ++CONFIG_SCHED_CLUSTER=y + CONFIG_SCHED_SMT=y + CONFIG_NR_CPUS=1024 + CONFIG_HOTPLUG_CPU=y +diff --git a/config.aarch64-64k b/config.aarch64-64k +index 41daa7820..cb0999f6c 100644 +--- a/config.aarch64-64k ++++ b/config.aarch64-64k +@@ -150,6 +150,7 @@ CONFIG_CGROUP_SCHED=y + CONFIG_FAIR_GROUP_SCHED=y + CONFIG_CFS_BANDWIDTH=y + # CONFIG_RT_GROUP_SCHED is not set ++CONFIG_SCHED_SOFT_DOMAIN=y + CONFIG_CGROUP_PIDS=y + CONFIG_CGROUP_RDMA=y + CONFIG_CGROUP_FREEZER=y +@@ -385,6 +386,7 @@ CONFIG_ARM64_PA_BITS=48 + # CONFIG_CPU_BIG_ENDIAN is not set + CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_SCHED_MC=y ++CONFIG_SCHED_CLUSTER=y + CONFIG_SCHED_SMT=y + CONFIG_NR_CPUS=1024 + CONFIG_HOTPLUG_CPU=y +diff --git a/config.x86_64 b/config.x86_64 +index e5908b4a5..2b2506c45 100644 +--- a/config.x86_64 ++++ b/config.x86_64 +@@ -172,6 +172,7 @@ CONFIG_CGROUP_SCHED=y + CONFIG_FAIR_GROUP_SCHED=y + CONFIG_CFS_BANDWIDTH=y + # CONFIG_RT_GROUP_SCHED is not set ++# CONFIG_SCHED_SOFT_DOMAIN is not set + CONFIG_CGROUP_PIDS=y + # CONFIG_CGROUP_RDMA is not set + CONFIG_CGROUP_FREEZER=y +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index ce703cae4..01f855335 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -77,6 +77,27 @@ extern int sched_domain_level_max; + + struct sched_group; + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ ++struct soft_subdomain { ++ /* the count of task group attached this sub domain. */ ++ int attached; ++ struct list_head node; ++ unsigned long span[]; ++}; ++ ++/* ++ * Each LLC builds a soft domain: ++ * A soft scheduling domain is divided into multiple subdomains, ++ * typically based on the physical structure of CPU clusters. ++ */ ++struct soft_domain { ++ struct list_head child_domain; ++ int nr_available_cpus; ++ unsigned long span[]; ++}; ++#endif ++ + struct sched_domain_shared { + atomic_t ref; + atomic_t nr_busy_cpus; +diff --git a/init/Kconfig b/init/Kconfig +index dabb28d8a..896773c70 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1033,6 +1033,18 @@ config RT_GROUP_SCHED + + endif #CGROUP_SCHED + ++config SCHED_SOFT_DOMAIN ++ bool "Soft domain scheduler" ++ depends on FAIR_GROUP_SCHED ++ depends on SCHED_CLUSTER ++ default n ++ help ++ This feature builds a CPU soft domain for each task group. Tasks are ++ prioritized and aggregated to execute within soft domains, which optimizes ++ resource allocation and enhances cache locality. ++ ++ If in doubt, say N. 
++ + config UCLAMP_TASK_GROUP + bool "Utilization clamping per group of tasks" + depends on CGROUP_SCHED +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 978fcfca5..df671145b 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o + obj-$(CONFIG_CPU_ISOLATION) += isolation.o + obj-$(CONFIG_PSI) += psi.o + obj-$(CONFIG_SCHED_CORE) += core_sched.o ++obj-$(CONFIG_SCHED_SOFT_DOMAIN) += soft_domain.o +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index ee817573a..c81c03aa9 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -9320,6 +9320,9 @@ void __init sched_init_smp(void) + init_sched_dl_class(); + + sched_smp_initialized = true; ++ ++ build_soft_domain(); ++ + } + + static int __init migration_init(void) +@@ -10009,6 +10012,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) + return 0; + } + ++static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ offline_soft_domain(tg); ++} ++ + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) + { + struct task_group *tg = css_tg(css); +@@ -10680,6 +10690,62 @@ static int cpu_override_proc_write_s64(struct cgroup_subsys_state *css, + #endif + #endif + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ ++static int cpu_soft_domain_write_s64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, ++ s64 val) ++{ ++ return sched_group_set_soft_domain(css_tg(css), val); ++} ++ ++static s64 cpu_soft_domain_read_s64(struct cgroup_subsys_state *css, ++ struct cftype *cftype) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ if (!tg->sf_ctx) ++ return 0; ++ ++ return (s64)tg->sf_ctx->policy; ++} ++ ++static int cpu_soft_domain_quota_write_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, u64 val) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ if (val > cpumask_weight(cpumask_of_node(0))) ++ return -EINVAL; ++ ++ return sched_group_set_soft_domain_quota(tg, val); ++} ++ ++static u64 cpu_soft_domain_quota_read_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ if (!tg->sf_ctx) ++ return 0; ++ ++ return (u64)tg->sf_ctx->nr_cpus; ++} ++ ++static int soft_domain_cpu_list_seq_show(struct seq_file *sf, void *v) ++{ ++ struct task_group *tg = css_tg(seq_css(sf)); ++ ++ if (!tg->sf_ctx) ++ return 0; ++ ++ seq_printf(sf, "%*pbl\n", cpumask_pr_args(to_cpumask(tg->sf_ctx->span))); ++ ++ return 0; ++} ++ ++#endif ++ + static struct cftype cpu_legacy_files[] = { + #ifdef CONFIG_FAIR_GROUP_SCHED + { +@@ -10700,6 +10766,25 @@ static struct cftype cpu_legacy_files[] = { + .write_s64 = cpu_override_proc_write_s64, + }, + #endif ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ { ++ .name = "soft_domain", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_s64 = cpu_soft_domain_read_s64, ++ .write_s64 = cpu_soft_domain_write_s64, ++ }, ++ { ++ .name = "soft_domain_nr_cpu", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_u64 = cpu_soft_domain_quota_read_u64, ++ .write_u64 = cpu_soft_domain_quota_write_u64, ++ }, ++ { ++ .name = "soft_domain_cpu_list", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = soft_domain_cpu_list_seq_show, ++ }, ++#endif + #ifdef CONFIG_CFS_BANDWIDTH + { + .name = "cfs_quota_us", +@@ -10951,6 +11036,7 @@ static struct cftype cpu_files[] = { + struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, ++ .css_offline = cpu_cgroup_css_offline, + .css_released = 
cpu_cgroup_css_released, + .css_free = cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 622c3e99e..b9e3a63ad 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6196,6 +6196,55 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, + static struct sched_group * + find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++static inline bool sched_group_sf_preferred(struct task_struct *p, struct sched_group *group) ++{ ++ struct soft_domain_ctx *ctx = NULL; ++ ++ if (!sched_feat(SOFT_DOMAIN)) ++ return true; ++ ++ ctx = task_group(p)->sf_ctx; ++ if (!ctx || ctx->policy == 0) ++ return true; ++ ++ if (!cpumask_intersects(sched_group_span(group), to_cpumask(ctx->span))) ++ return false; ++ ++ return true; ++} ++ ++static inline bool cpu_is_sf_preferred(struct task_struct *p, int cpu) ++{ ++ struct soft_domain_ctx *ctx = NULL; ++ ++ if (!sched_feat(SOFT_DOMAIN)) ++ return true; ++ ++ ctx = task_group(p)->sf_ctx; ++ if (!ctx || ctx->policy == 0) ++ return true; ++ ++ if (!cpumask_test_cpu(cpu, to_cpumask(ctx->span))) ++ return false; ++ ++ return true; ++} ++#else ++ ++static inline bool sched_group_sf_preferred(struct task_struct *p, struct sched_group *group) ++{ ++ return true; ++} ++ ++static inline bool cpu_is_sf_preferred(struct task_struct *p, int cpu) ++{ ++ return true; ++} ++ ++#endif ++ ++ + /* + * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + */ +@@ -6220,6 +6269,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this + if (!sched_core_cookie_match(rq, p)) + continue; + ++ if (!cpu_is_sf_preferred(p, i)) ++ continue; ++ + if (available_idle_cpu(i)) { + struct cpuidle_state *idle = idle_get_state(rq); + if (idle && idle->exit_latency < min_exit_latency) { +@@ -6534,6 +6586,40 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + } + } + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ if (sched_feat(SOFT_DOMAIN)) { ++ struct task_group *tg = task_group(p); ++ ++ if (tg->sf_ctx && tg->sf_ctx->policy != 0) { ++ struct cpumask *tmpmask = to_cpumask(tg->sf_ctx->span); ++ ++ for_each_cpu_wrap(cpu, tmpmask, target + 1) { ++ if (!cpumask_test_cpu(cpu, cpus)) ++ continue; ++ ++ if (has_idle_core) { ++ i = select_idle_core(p, cpu, cpus, &idle_cpu); ++ if ((unsigned int)i < nr_cpumask_bits) ++ return i; ++ ++ } else { ++ if (--nr <= 0) ++ return -1; ++ i = __select_idle_cpu(cpu, p, &idle_cpu); ++ if ((unsigned int)i < nr_cpumask_bits) ++ return i; ++ } ++ } ++ ++ if (idle_cpu != -1) ++ return idle_cpu; ++ ++ cpumask_andnot(cpus, cpus, tmpmask); ++ } ++ ++ } ++#endif ++ + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + +@@ -7193,6 +7279,36 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + return target; + } + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++static int wake_soft_domain(struct task_struct *p, int target, int *cpu, int sd_flags) ++{ ++ struct cpumask *mask = this_cpu_cpumask_var_ptr(select_idle_mask); ++ struct soft_domain_ctx *ctx = NULL; ++ ++ ctx = task_group(p)->sf_ctx; ++ if (!ctx || ctx->policy == 0) ++ goto out; ++ ++#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE ++ cpumask_and(mask, to_cpumask(ctx->span), p->select_cpus); ++#else ++ cpumask_and(mask, to_cpumask(ctx->span), p->cpus_ptr); ++#endif ++ cpumask_and(mask, mask, cpu_active_mask); ++ if (cpumask_empty(mask) || 
cpumask_test_cpu(target, mask)) ++ goto prefer; ++ else ++ target = cpumask_any_and_distribute(mask, mask); ++ ++prefer: ++ if (sd_flags & SD_BALANCE_FORK) ++ *cpu = target; ++out: ++ ++ return target; ++} ++#endif ++ + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, +@@ -7232,6 +7348,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + } + + rcu_read_lock(); ++ ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ if (sched_feat(SOFT_DOMAIN)) ++ new_cpu = prev_cpu = wake_soft_domain(p, prev_cpu, &cpu, sd_flag); ++#endif ++ + for_each_domain(cpu, tmp) { + /* + * If both 'cpu' and 'prev_cpu' are part of this domain, +@@ -8109,6 +8231,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ /* Do not migrate soft domain tasks to outside of prefer cluster. */ ++ if (sched_feat(SOFT_DOMAIN)) { ++ struct soft_domain_ctx *ctx = task_group(p)->sf_ctx; ++ ++ if (ctx && ctx->policy && ++ !cpumask_test_cpu(env->dst_cpu, to_cpumask(ctx->span))) ++ return 0; ++ } ++#endif ++ + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; +@@ -9409,6 +9542,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) + if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) + continue; + ++ /* Skip over this group if not in soft domain */ ++ if (!sched_group_sf_preferred(p, group)) ++ continue; ++ + local_group = cpumask_test_cpu(this_cpu, + sched_group_span(group)); + +@@ -11859,6 +11996,8 @@ void free_fair_sched_group(struct task_group *tg) + { + int i; + ++ destroy_soft_domain(tg); ++ + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); +@@ -11874,7 +12013,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + { + struct sched_entity *se; + struct cfs_rq *cfs_rq; +- int i; ++ int i, ret; + + tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); + if (!tg->cfs_rq) +@@ -11891,6 +12030,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + ++ ret = init_soft_domain(tg, parent); ++ if (ret) ++ goto err; ++ + for_each_possible_cpu(i) { + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); +@@ -11912,6 +12055,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + err_free_rq: + kfree(cfs_rq); + err: ++ destroy_soft_domain(tg); + return 0; + } + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 1b0979005..9d0d29a12 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -108,3 +108,7 @@ SCHED_FEAT(BASE_SLICE, true) + */ + SCHED_FEAT(PREFER_HIGH_WEIGHT, true) + #endif ++ ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++SCHED_FEAT(SOFT_DOMAIN, false) ++#endif +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 268760576..abd80048c 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -386,6 +386,16 @@ struct cfs_bandwidth { + #endif + }; + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ ++struct soft_domain_ctx { ++ int policy; ++ int nr_cpus; ++ struct soft_domain *sf_d; ++ unsigned long span[]; ++}; ++#endif ++ + /* Task group related information */ + struct task_group { + struct cgroup_subsys_state css; +@@ -441,6 +451,9 @@ struct task_group { + struct uclamp_se 
uclamp[UCLAMP_CNT]; + #endif + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ struct soft_domain_ctx *sf_ctx; ++#endif + }; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -3190,3 +3203,31 @@ extern int sched_dynamic_mode(const char *str); + extern void sched_dynamic_update(int mode); + #endif + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++void build_soft_domain(void); ++int init_soft_domain(struct task_group *tg, struct task_group *parent); ++int destroy_soft_domain(struct task_group *tg); ++void offline_soft_domain(struct task_group *tg); ++int sched_group_set_soft_domain(struct task_group *tg, long val); ++int sched_group_set_soft_domain_quota(struct task_group *tg, long val); ++ ++static inline struct cpumask *soft_domain_span(unsigned long span[]) ++{ ++ return to_cpumask(span); ++} ++#else ++ ++static inline void build_soft_domain(void) { } ++static inline int init_soft_domain(struct task_group *tg, struct task_group *parent) ++{ ++ return 0; ++} ++ ++static inline void offline_soft_domain(struct task_group *tg) { } ++ ++static inline int destroy_soft_domain(struct task_group *tg) ++{ ++ return 0; ++} ++ ++#endif +diff --git a/kernel/sched/soft_domain.c b/kernel/sched/soft_domain.c +new file mode 100644 +index 000000000..0ed239cdb +--- /dev/null ++++ b/kernel/sched/soft_domain.c +@@ -0,0 +1,500 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++/* ++ * Common code for Soft Domain Scheduling ++ * ++ * Copyright (C) 2025-2025 Huawei Technologies Co., Ltd ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. 
++ * ++ */ ++ ++#include "sched.h" ++#include ++ ++static DEFINE_STATIC_KEY_TRUE(__soft_domain_switch); ++ ++static int __init soft_domain_switch_setup(char *str) ++{ ++ int val = 0; ++ ++ if (kstrtoint(str, 0, &val)) ++ pr_warn("sched_soft_domain parameter is error: %s\n", str); ++ else { ++ if (val == 1) ++ static_branch_enable(&__soft_domain_switch); ++ else if (val == 0) ++ static_branch_disable(&__soft_domain_switch); ++ } ++ ++ return 1; ++} ++__setup("sched_soft_domain=", soft_domain_switch_setup); ++ ++static bool soft_domain_enabled(void) ++{ ++ return static_branch_likely(&__soft_domain_switch); ++} ++ ++static DEFINE_PER_CPU(struct soft_domain *, g_sf_d); ++ ++static void free_sub_soft_domain(struct soft_domain *sf_d); ++ ++static int build_soft_sub_domain(int nid, struct cpumask *cpus) ++{ ++ struct cpumask *span = cpumask_of_node(nid); ++ struct soft_domain *sf_d = NULL; ++ int i; ++ ++ sf_d = kzalloc_node(sizeof(struct soft_domain) + cpumask_size(), ++ GFP_KERNEL, nid); ++ if (!sf_d) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&sf_d->child_domain); ++ sf_d->nr_available_cpus = cpumask_weight(span); ++ cpumask_copy(to_cpumask(sf_d->span), span); ++ ++ for_each_cpu_and(i, span, cpus) { ++ struct soft_subdomain *sub_d = NULL; ++ ++ sub_d = kzalloc_node(sizeof(struct soft_subdomain) + cpumask_size(), ++ GFP_KERNEL, nid); ++ if (!sub_d) { ++ free_sub_soft_domain(sf_d); ++ return -ENOMEM; ++ } ++ list_add_tail(&sub_d->node, &sf_d->child_domain); ++ cpumask_and(soft_domain_span(sub_d->span), span, topology_cluster_cpumask(i)); ++ cpumask_andnot(cpus, cpus, topology_cluster_cpumask(i)); ++ } ++ ++ for_each_cpu(i, span) { ++ rcu_assign_pointer(per_cpu(g_sf_d, i), sf_d); ++ } ++ ++ return 0; ++} ++ ++static void free_sub_soft_domain(struct soft_domain *sf_d) ++{ ++ struct list_head *children = &sf_d->child_domain; ++ struct soft_subdomain *entry = NULL, *next = NULL; ++ int i; ++ ++ list_for_each_entry_safe(entry, next, children, node) { ++ list_del(&entry->node); ++ kfree(entry); ++ } ++ ++ for_each_cpu(i, to_cpumask(sf_d->span)) { ++ rcu_assign_pointer(per_cpu(g_sf_d, i), NULL); ++ } ++ ++ kfree(sf_d); ++} ++ ++static void free_soft_domain(void) ++{ ++ struct soft_domain *sf_d = NULL; ++ int i; ++ ++ for_each_cpu(i, cpu_active_mask) { ++ sf_d = rcu_dereference(per_cpu(g_sf_d, i)); ++ if (sf_d) ++ free_sub_soft_domain(sf_d); ++ } ++ ++ static_branch_disable(&__soft_domain_switch); ++} ++ ++void build_soft_domain(void) ++{ ++ static struct cpumask cpus; ++ int i, ret; ++ ++ if (!soft_domain_enabled()) ++ return; ++ ++ cpumask_copy(&cpus, cpu_active_mask); ++ rcu_read_lock(); ++ for (i = 0; i < nr_node_ids; ++ i) { ++ /* build soft domain for each numa domain. 
*/ ++ ret = build_soft_sub_domain(i, &cpus); ++ if (ret) { ++ free_soft_domain(); ++ goto out; ++ } ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static DEFINE_MUTEX(soft_domain_mutex); ++ ++#define NR_MAX_CLUSTER 16 ++ ++struct domain_node { ++ struct soft_subdomain *sud_d; ++ unsigned int attached; ++ int idx; ++ unsigned long util; ++}; ++ ++static int subdomain_cmp(const void *a, const void *b) ++{ ++ struct domain_node *ca = (struct domain_node *)a; ++ struct domain_node *cb = (struct domain_node *)b; ++ ++ if (ca->attached == cb->attached && ca->attached == 0) { ++ if (ca->idx < cb->idx) ++ return -1; ++ else ++ return 1; ++ } ++ ++ if (ca->attached < cb->attached || ++ (ca->attached == cb->attached && ca->util < cb->util)) ++ return -1; ++ ++ return 1; ++} ++ ++struct soft_domain_args { ++ int policy; ++ int nr_cpu; ++ struct cpumask *cpus; ++}; ++ ++static int tg_set_soft_domain(struct task_group *tg, void *data) ++{ ++ struct soft_domain_args *args = (struct soft_domain_args *)data; ++ ++ tg->sf_ctx->policy = args->policy; ++ if (args->policy) { ++ cpumask_copy(to_cpumask(tg->sf_ctx->span), args->cpus); ++ tg->sf_ctx->nr_cpus = args->nr_cpu; ++ } else ++ cpumask_clear(to_cpumask(tg->sf_ctx->span)); ++ ++ return 0; ++} ++ ++static int __calc_cpu(struct task_group *tg) ++{ ++ int nr_cpu = 1; ++ ++ if (tg->sf_ctx->nr_cpus) ++ nr_cpu = tg->sf_ctx->nr_cpus; ++#ifdef CONFIG_CFS_BANDWIDTH ++ else if (tg->cfs_bandwidth.quota != RUNTIME_INF) ++ nr_cpu = DIV_ROUND_UP_ULL(tg->cfs_bandwidth.quota, tg->cfs_bandwidth.period); ++#endif ++ ++ return nr_cpu; ++} ++ ++static unsigned long sum_util(struct cpumask *mask) ++{ ++ unsigned long sum = 0; ++ int cpu; ++ ++ for_each_cpu(cpu, mask) ++ sum += cpu_util_cfs(cpu_rq(cpu)); ++ ++ return sum; ++} ++ ++static int __check_policy(struct task_group *tg, void *data) ++{ ++ return !!tg->sf_ctx->policy; ++} ++ ++static int check_policy(struct task_group *tg, long policy) ++{ ++ int ret; ++ ++ rcu_read_lock(); ++ ret = walk_tg_tree_from(tg, __check_policy, tg_nop, NULL); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static struct soft_domain *find_idlest_llc(long policy, ++ int nr_cpu, cpumask_var_t cpus) ++{ ++ int cpu; ++ int max_cpu = 0; ++ struct soft_domain *idlest = NULL; ++ unsigned long min_util = ULONG_MAX; ++ ++ /* The user has specified the llc. */ ++ if (policy > 0) { ++ for_each_cpu(cpu, cpumask_of_node(policy-1)) { ++ idlest = rcu_dereference(per_cpu(g_sf_d, cpu)); ++ if (idlest != NULL) ++ break; ++ } ++ ++ if (idlest && nr_cpu <= cpumask_weight(to_cpumask(idlest->span))) ++ return idlest; ++ ++ return NULL; ++ } ++ ++ cpumask_copy(cpus, cpu_active_mask); ++ for_each_cpu(cpu, cpus) { ++ struct soft_domain *sf_d = NULL; ++ struct cpumask *mask; ++ ++ sf_d = rcu_dereference(per_cpu(g_sf_d, cpu)); ++ if (sf_d == NULL) ++ continue; ++ ++ mask = to_cpumask(sf_d->span); ++ cpumask_andnot(cpus, cpus, mask); ++ if (nr_cpu > cpumask_weight(mask)) ++ continue; ++ ++ /* ++ * LLC selection order: ++ * 1. When the number of idle cpus meet the requirements, ++ * the one with more idles cpus is better; ++ * 2. Under the condition of insufficient idle cpus, util ++ * is lower, the better. 
++ */ ++ if (sf_d->nr_available_cpus > max_cpu && ++ nr_cpu <= sf_d->nr_available_cpus) { ++ max_cpu = sf_d->nr_available_cpus; ++ idlest = sf_d; ++ } else if (max_cpu == 0) { /* No llc meets the demand */ ++ unsigned long util = sum_util(mask); ++ ++ if (idlest == NULL || util < min_util) { ++ idlest = sf_d; ++ min_util = util; ++ } ++ } ++ } ++ ++ return idlest; ++} ++ ++static int __sched_group_set_soft_domain(struct task_group *tg, long policy) ++{ ++ int cpu; ++ int ret = 0; ++ cpumask_var_t cpus; ++ struct soft_domain_args args; ++ struct soft_domain *sf_d = NULL; ++ struct domain_node nodes[NR_MAX_CLUSTER] = {0}; ++ int nr_cpu = __calc_cpu(tg); ++ ++ if (check_policy(tg, policy)) ++ return -EINVAL; ++ ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ /* 1. Find a idlest llc. */ ++ sf_d = find_idlest_llc(policy, nr_cpu, cpus); ++ if (sf_d != NULL) { ++ /* 2. select idlest clusters. */ ++ struct list_head *children = &sf_d->child_domain; ++ struct soft_subdomain *sub_d = NULL; ++ int nr = 0, i; ++ struct cpumask *tmpmask = NULL; ++ int tmp_cpu = nr_cpu; ++ ++ list_for_each_entry(sub_d, children, node) { ++ nodes[nr].sud_d = sub_d; ++ nodes[nr].attached = sub_d->attached; ++ tmpmask = to_cpumask(sub_d->span); ++ cpu = cpumask_first(tmpmask); ++ nodes[nr].util = sum_util(tmpmask); ++ nodes[nr].idx = nr; ++ nr++; ++ } ++ ++ cpumask_clear(cpus); ++ ++ sort(nodes, nr, sizeof(struct domain_node), subdomain_cmp, NULL); ++ sf_d->nr_available_cpus -= tmp_cpu; ++ for (i = 0; i < nr; i++) { ++ sub_d = nodes[i].sud_d; ++ tmpmask = to_cpumask(sub_d->span); ++ cpumask_or(cpus, cpus, tmpmask); ++ sub_d->attached++; ++ nr_cpu -= cpumask_weight(tmpmask); ++ if (nr_cpu <= 0) ++ break; ++ } ++ ++ /* 3. attach task group to softdomain. */ ++ args.policy = policy; ++ args.cpus = cpus; ++ args.nr_cpu = tmp_cpu; ++ walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args); ++ ++ /* ++ * 4.add tg to llc domain task_groups list for load balance. ++ */ ++ tg->sf_ctx->nr_cpus = tmp_cpu; ++ tg->sf_ctx->sf_d = sf_d; ++ } else { ++ ret = -EINVAL; ++ } ++ rcu_read_unlock(); ++ ++ free_cpumask_var(cpus); ++ ++ return ret; ++} ++ ++static int __sched_group_unset_soft_domain(struct task_group *tg) ++{ ++ struct soft_domain_args args = { ++ .policy = 0, ++ }; ++ struct soft_domain *sf_d = NULL; ++ struct soft_subdomain *sub_d = NULL; ++ struct list_head *children = NULL; ++ ++ /* If parent has set soft domain, child group can't unset itself. */ ++ if (tg->parent->sf_ctx != NULL && tg->parent->sf_ctx->policy != 0) ++ return -EINVAL; ++ ++ sf_d = tg->sf_ctx->sf_d; ++ sf_d->nr_available_cpus += __calc_cpu(tg); ++ children = &sf_d->child_domain; ++ ++ list_for_each_entry(sub_d, children, node) { ++ if (cpumask_intersects(to_cpumask(tg->sf_ctx->span), to_cpumask(sub_d->span))) ++ sub_d->attached--; ++ } ++ ++ walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args); ++ ++ return 0; ++} ++ ++int sched_group_set_soft_domain(struct task_group *tg, long val) ++{ ++ int ret = 0; ++ ++ if (!soft_domain_enabled()) ++ return -EPERM; ++ ++ if (val < -1 || val > nr_node_ids) ++ return -EINVAL; ++ ++ mutex_lock(&soft_domain_mutex); ++ ++ /* If enable or disable is repeated, directly return. 
*/ ++ if (!!tg->sf_ctx->policy == !!val) ++ goto out; ++ ++ if (val == 0) ++ ret = __sched_group_unset_soft_domain(tg); ++ else ++ ret = __sched_group_set_soft_domain(tg, val); ++ ++ if (!ret) ++ tg->sf_ctx->policy = val; ++ ++out: ++ mutex_unlock(&soft_domain_mutex); ++ ++ return ret; ++} ++ ++int sched_group_set_soft_domain_quota(struct task_group *tg, long val) ++{ ++ int ret = 0; ++ ++ if (!soft_domain_enabled()) ++ return -EPERM; ++ ++ mutex_lock(&soft_domain_mutex); ++ if (tg->sf_ctx->policy != 0) { ++ ret = -EINVAL; ++ goto out; ++ } else ++ tg->sf_ctx->nr_cpus = (int)val; ++ ++out: ++ mutex_unlock(&soft_domain_mutex); ++ ++ return ret; ++} ++ ++int init_soft_domain(struct task_group *tg, struct task_group *parent) ++{ ++ struct soft_domain_ctx *sf_ctx = NULL; ++ struct soft_domain_ctx *psf_ctx = NULL; ++ ++ if (!soft_domain_enabled()) ++ return 0; ++ ++ sf_ctx = kzalloc(sizeof(*sf_ctx) + cpumask_size(), GFP_KERNEL); ++ if (!sf_ctx) ++ return -ENOMEM; ++ ++ mutex_lock(&soft_domain_mutex); ++ psf_ctx = parent->sf_ctx; ++ if (psf_ctx) { ++ sf_ctx->policy = psf_ctx->policy; ++ sf_ctx->nr_cpus = psf_ctx->nr_cpus; ++ cpumask_copy(to_cpumask(sf_ctx->span), to_cpumask(psf_ctx->span)); ++ } ++ ++ tg->sf_ctx = sf_ctx; ++ mutex_unlock(&soft_domain_mutex); ++ ++ return 0; ++} ++ ++void offline_soft_domain(struct task_group *tg) ++{ ++ struct soft_domain_ctx *sf_ctx = NULL; ++ struct soft_domain_ctx *psf_ctx = NULL; ++ ++ if (!soft_domain_enabled()) ++ return; ++ ++ sf_ctx = tg->sf_ctx; ++ psf_ctx = tg->parent->sf_ctx; ++ ++ if (!sf_ctx) ++ return; ++ ++ mutex_lock(&soft_domain_mutex); ++ if (sf_ctx->policy != 0) { ++ /* ++ * parent group is not set, this group set ++ * soft domain by user. ++ */ ++ if (psf_ctx == NULL || psf_ctx->policy == 0) ++ __sched_group_unset_soft_domain(tg); ++ } ++ mutex_unlock(&soft_domain_mutex); ++} ++ ++int destroy_soft_domain(struct task_group *tg) ++{ ++ if (!soft_domain_enabled()) ++ return 0; ++ ++ kfree(tg->sf_ctx); ++ ++ return 0; ++} +-- +2.20.1 + diff --git a/0005-net-venetcls-introduce-venetcls-for-network-optimiza.patch b/0005-net-venetcls-introduce-venetcls-for-network-optimiza.patch new file mode 100644 index 0000000000000000000000000000000000000000..80dcecbe1d16ff2604c11fe538363ff74c55fc2f --- /dev/null +++ b/0005-net-venetcls-introduce-venetcls-for-network-optimiza.patch @@ -0,0 +1,3008 @@ +From 8be6d6f669b88c735cfe2a94e12dea20ebb0f87f Mon Sep 17 00:00:00 2001 +From: Yue Haibing +Date: Tue, 5 Aug 2025 16:05:52 +0800 +Subject: [PATCH] net/venetcls: introduce venetcls for network optimization + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/ICBFCS +CVE: NA + +-------------------------------- + +This introduces a kind of network optimization method named venetcls. It +can configure the ntuple rule, and bind interrupt to the netdev queue +automatically. 
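(Illustration only, not part of the patch.) The header added below only publishes a single __rcu ops pointer plus inline wrappers that the core network and TCP paths call; the classifier plugs in by filling a struct vecls_hook_ops and publishing it through vecls_ops, and any hook left NULL is simply skipped by the wrappers. A minimal sketch of that wiring; the callback bodies and module boilerplate here are assumptions for illustration, the real implementation lives in venetcls_main.c (not shown in full here):

#include <linux/module.h>
#include <linux/netdevice.h>
#include <net/sock.h>
#include <linux/venetcls.h>

static void demo_flow_update(struct sock *sk)
{
	/* called from tcp_recvmsg(): record where this socket consumes data */
}

static void demo_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail)
{
	*cpu = -1;	/* a negative cpu tells the hook in dev.c to fall through */
}

static const struct vecls_hook_ops demo_ops = {
	.vecls_flow_update	= demo_flow_update,
	.vecls_set_cpu		= demo_set_cpu,
	/* .vecls_cfg_rxcls and .vecls_timeout left NULL: wrappers check for that */
};

static int __init demo_init(void)
{
	rcu_assign_pointer(vecls_ops, &demo_ops);
	return 0;
}

static void __exit demo_exit(void)
{
	RCU_INIT_POINTER(vecls_ops, NULL);
	synchronize_rcu();	/* make sure no reader still sees demo_ops */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");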
+ +Signed-off-by: Yue Haibing +Signed-off-by: Wang Liang +Signed-off-by: Liu Jian +Signed-off-by: yuelg +--- + include/linux/netdevice.h | 3 + + include/linux/venetcls.h | 97 +++ + kernel/irq/irqdesc.c | 2 +- + net/Kconfig | 1 + + net/Makefile | 1 + + net/core/dev.c | 22 + + net/ipv4/af_inet.c | 6 + + net/ipv4/tcp.c | 9 + + net/venetcls/Kconfig | 7 + + net/venetcls/Makefile | 7 + + net/venetcls/asmdefs.h | 61 ++ + net/venetcls/memcpy-sve.S | 157 +++++ + net/venetcls/venetcls.h | 187 ++++++ + net/venetcls/venetcls_flow.c | 491 +++++++++++++++ + net/venetcls/venetcls_main.c | 1086 ++++++++++++++++++++++++++++++++ + net/venetcls/venetcls_ntuple.c | 643 +++++++++++++++++++ + 16 files changed, 2779 insertions(+), 1 deletion(-) + create mode 100644 include/linux/venetcls.h + create mode 100644 net/venetcls/Kconfig + create mode 100644 net/venetcls/Makefile + create mode 100644 net/venetcls/asmdefs.h + create mode 100644 net/venetcls/memcpy-sve.S + create mode 100644 net/venetcls/venetcls.h + create mode 100644 net/venetcls/venetcls_flow.c + create mode 100644 net/venetcls/venetcls_main.c + create mode 100644 net/venetcls/venetcls_ntuple.c + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index cc1f14f3c..e5f876cec 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -766,6 +766,9 @@ struct netdev_rx_queue { + struct xsk_buff_pool *pool; + #endif + struct file __rcu *dmabuf_pages; ++#if IS_ENABLED(CONFIG_VENETCLS) ++ void __rcu *vecls_ftb; ++#endif + } ____cacheline_aligned_in_smp; + + struct page * +diff --git a/include/linux/venetcls.h b/include/linux/venetcls.h +new file mode 100644 +index 000000000..792991155 +--- /dev/null ++++ b/include/linux/venetcls.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _LINUX_VENETCLS_H ++#define _LINUX_VENETCLS_H ++ ++struct vecls_hook_ops { ++ void (*vecls_cfg_rxcls)(struct sock *sk, int is_del); ++ void (*vecls_flow_update)(struct sock *sk); ++ void (*vecls_set_cpu)(struct sk_buff *skb, int *cpu, int *last_qtail); ++ bool (*vecls_timeout)(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id); ++}; ++ ++typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail); ++extern const struct vecls_hook_ops __rcu *vecls_ops; ++ ++static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) ++{ ++ const struct vecls_hook_ops *ops; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && ops->vecls_cfg_rxcls) ++ ops->vecls_cfg_rxcls(sk, is_del); ++ rcu_read_unlock(); ++} ++ ++static inline void venetcls_flow_update(struct sock *sk) ++{ ++ const struct vecls_hook_ops *ops; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && ops->vecls_flow_update) ++ ops->vecls_flow_update(sk); ++ rcu_read_unlock(); ++} ++ ++static inline bool ++venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int* ret) ++{ ++ const struct vecls_hook_ops *ops; ++ int cpu, last_qtail; ++ bool result = false; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && ops->vecls_set_cpu) { ++ ops->vecls_set_cpu(skb, &cpu, &last_qtail); ++ if (cpu >= 0) { ++ *ret = enq_func(skb, cpu, &last_qtail); ++ result = true; ++ } ++ } ++ rcu_read_unlock(); ++ return result; ++} ++ ++static inline void ++venetcls_skblist_set_cpu(struct list_head *head, enqueue_f enq_func) ++{ ++ const struct vecls_hook_ops *ops; ++ struct sk_buff *skb, *next; ++ int cpu, last_qtail; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && 
ops->vecls_set_cpu) { ++ list_for_each_entry_safe(skb, next, head, list) { ++ ops->vecls_set_cpu(skb, &cpu, &last_qtail); ++ if (cpu >= 0) { ++ skb_list_del_init(skb); ++ enq_func(skb, cpu, &last_qtail); ++ } ++ } ++ } ++ rcu_read_unlock(); ++ return; ++} ++ ++static inline bool venetcls_may_expire_flow(struct net_device *dev, ++ u16 rxq_index, u32 flow_id, ++ u16 filter_id, bool *expire) ++{ ++ const struct vecls_hook_ops *ops; ++ bool ret = false; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && ops->vecls_timeout) { ++ *expire = ops->vecls_timeout(dev, rxq_index, flow_id, filter_id); ++ ret = true; ++ } ++ rcu_read_unlock(); ++ return ret; ++} ++ ++#endif /* _LINUX_VENETCLS_H */ +diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c +index 8202d4a99..eb8641e22 100644 +--- a/kernel/irq/irqdesc.c ++++ b/kernel/irq/irqdesc.c +@@ -366,7 +366,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) + { + return radix_tree_lookup(&irq_desc_tree, irq); + } +-#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE ++#if defined(CONFIG_KVM_BOOK3S_64_HV_MODULE) || IS_ENABLED(CONFIG_VENETCLS) + EXPORT_SYMBOL_GPL(irq_to_desc); + #endif + +diff --git a/net/Kconfig b/net/Kconfig +index dc8451e75..2b68c0f86 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -72,6 +72,7 @@ source "net/xfrm/Kconfig" + source "net/iucv/Kconfig" + source "net/smc/Kconfig" + source "net/xdp/Kconfig" ++source "net/venetcls/Kconfig" + + config INET + bool "TCP/IP networking" +diff --git a/net/Makefile b/net/Makefile +index 6a62e5b27..a2cb1281e 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -78,3 +78,4 @@ obj-$(CONFIG_NET_NCSI) += ncsi/ + obj-$(CONFIG_XDP_SOCKETS) += xdp/ + obj-$(CONFIG_MPTCP) += mptcp/ + obj-$(CONFIG_MCTP) += mctp/ ++obj-$(CONFIG_VENETCLS) += venetcls/ +diff --git a/net/core/dev.c b/net/core/dev.c +index f628494a1..1cd6b5413 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -160,6 +160,12 @@ + /* This should be increased if a protocol with a bigger head is added. */ + #define GRO_MAX_HEAD (MAX_HEADER + 128) + ++#if IS_ENABLED(CONFIG_VENETCLS) ++#include ++const struct vecls_hook_ops __rcu *vecls_ops __read_mostly; ++EXPORT_SYMBOL_GPL(vecls_ops); ++#endif ++ + static DEFINE_SPINLOCK(ptype_lock); + static DEFINE_SPINLOCK(offload_lock); + struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +@@ -4770,6 +4776,10 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + bool expire = true; + unsigned int cpu; + ++#if IS_ENABLED(CONFIG_VENETCLS) ++ if (venetcls_may_expire_flow(dev, rxq_index, flow_id, filter_id, &expire)) ++ return expire; ++#endif + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { +@@ -5881,6 +5891,12 @@ static int netif_receive_skb_internal(struct sk_buff *skb) + return ret; + } + } ++#endif ++#if IS_ENABLED(CONFIG_VENETCLS) ++ if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { ++ rcu_read_unlock(); ++ return ret; ++ } + #endif + ret = __netif_receive_skb(skb); + rcu_read_unlock(); +@@ -5915,6 +5931,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) + } + } + } ++#endif ++#if IS_ENABLED(CONFIG_VENETCLS) ++ venetcls_skblist_set_cpu(head, enqueue_to_backlog); + #endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +@@ -10271,6 +10290,9 @@ int __netdev_update_features(struct net_device *dev) + + return err < 0 ? 
0 : 1; + } ++#if IS_ENABLED(CONFIG_VENETCLS) ++EXPORT_SYMBOL(__netdev_update_features); ++#endif + + static int netdev_do_alloc_pcpu_stats(struct net_device *dev) + { +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 5dc1955e3..06b917182 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -120,6 +120,9 @@ + #include + + #include ++#if IS_ENABLED(CONFIG_VENETCLS) ++#include ++#endif + + /* The inetsw table contains everything that inet_create needs to + * build a new socket. +@@ -229,6 +232,9 @@ int inet_listen(struct socket *sock, int backlog) + if (err) + goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); ++#if IS_ENABLED(CONFIG_VENETCLS) ++ venetcls_cfg_rxcls(sk, 0); ++#endif + } + err = 0; + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index e8b7f0c5d..cc84873ce 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -281,6 +281,9 @@ + #include + #include + #include ++#if IS_ENABLED(CONFIG_VENETCLS) ++#include ++#endif + + /* Track pending CMSGs. */ + enum { +@@ -2940,6 +2943,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + ++#if IS_ENABLED(CONFIG_VENETCLS) ++ venetcls_flow_update(sk); ++#endif + if (sk_can_busy_loop(sk) && + skb_queue_empty_lockless(&sk->sk_receive_queue) && + sk->sk_state == TCP_ESTABLISHED) +@@ -3300,6 +3306,9 @@ void __tcp_close(struct sock *sk, long timeout) + void tcp_close(struct sock *sk, long timeout) + { + lock_sock(sk); ++#if IS_ENABLED(CONFIG_VENETCLS) ++ venetcls_cfg_rxcls(sk, 1); ++#endif + __tcp_close(sk, timeout); + release_sock(sk); + sock_put(sk); +diff --git a/net/venetcls/Kconfig b/net/venetcls/Kconfig +new file mode 100644 +index 000000000..cd4d7c8f9 +--- /dev/null ++++ b/net/venetcls/Kconfig +@@ -0,0 +1,7 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++config VENETCLS ++ tristate "Network classification" ++ default n ++ help ++ Allow to bind NIC interrupts and configure ntuple rules to ++ achieve sock numa affinity +diff --git a/net/venetcls/Makefile b/net/venetcls/Makefile +new file mode 100644 +index 000000000..639a81d7d +--- /dev/null ++++ b/net/venetcls/Makefile +@@ -0,0 +1,7 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++ ++obj-$(CONFIG_VENETCLS) = venetcls.o ++venetcls-y := venetcls_main.o venetcls_ntuple.o venetcls_flow.o ++ifeq ($(CONFIG_ARM64_SVE),y) ++venetcls-y += memcpy-sve.o ++endif +diff --git a/net/venetcls/asmdefs.h b/net/venetcls/asmdefs.h +new file mode 100644 +index 000000000..8138a94c1 +--- /dev/null ++++ b/net/venetcls/asmdefs.h +@@ -0,0 +1,61 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _ASMDEFS_H ++#define _ASMDEFS_H ++ ++/* Branch Target Identitication support. */ ++#define BTI_C hint 34 ++#define BTI_J hint 36 ++/* Return address signing support (pac-ret). */ ++#define PACIASP hint 25; .cfi_window_save ++#define AUTIASP hint 29; .cfi_window_save ++ ++/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ ++#define FEATURE_1_AND 0xc0000000 ++#define FEATURE_1_BTI 1 ++#define FEATURE_1_PAC 2 ++ ++/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ ++#define GNU_PROPERTY(type, value) \ ++ .section .note.gnu.property, "a"; \ ++ .p2align 3; \ ++ .word 4; \ ++ .word 16; \ ++ .word 5; \ ++ .asciz "GNU"; \ ++ .word type; \ ++ .word 4; \ ++ .word value; \ ++ .word 0; \ ++ .text ++ ++#ifndef WANT_GNU_PROPERTY ++#define WANT_GNU_PROPERTY 1 ++#endif ++ ++#if WANT_GNU_PROPERTY ++/* Add property note with supported features to all asm files. 
*/ ++GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) ++#endif ++ ++#define ENTRY_ALIGN(name, alignment) \ ++ .global name; \ ++ .type name, %function; \ ++ .align alignment; \ ++name: \ ++ .cfi_startproc; \ ++ BTI_C; ++ ++#define ENTRY(name) ENTRY_ALIGN(name, 6) ++ ++#define ENTRY_ALIAS(name) \ ++ .global name; \ ++ .type name, %function; \ ++ name: ++ ++#define END(name) \ ++ .cfi_endproc; \ ++ .size name, .-name; ++ ++#define L(l) .L ## l ++ ++#endif +diff --git a/net/venetcls/memcpy-sve.S b/net/venetcls/memcpy-sve.S +new file mode 100644 +index 000000000..106e4c302 +--- /dev/null ++++ b/net/venetcls/memcpy-sve.S +@@ -0,0 +1,157 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#include "asmdefs.h" ++ ++.arch armv8-a+sve ++ ++#define dstin x0 ++#define src x1 ++#define count x2 ++#define dst x3 ++#define srcend x4 ++#define dstend x5 ++#define tmp1 x6 ++#define vlen x6 ++ ++#define A_q q0 ++#define B_q q1 ++#define C_q q2 ++#define D_q q3 ++#define E_q q4 ++#define F_q q5 ++#define G_q q6 ++#define H_q q7 ++ ++/* This implementation handles overlaps and supports both memcpy and memmove ++ from a single entry point. It uses unaligned accesses and branchless ++ sequences to keep the code small, simple and improve performance. ++ SVE vectors are used to speedup small copies. ++ ++ Copies are split into 3 main cases: small copies of up to 32 bytes, medium ++ copies of up to 128 bytes, and large copies. The overhead of the overlap ++ check is negligible since it is only required for large copies. ++ ++ Large copies use a software pipelined loop processing 64 bytes per iteration. ++ The source pointer is 16-byte aligned to minimize unaligned accesses. ++ The loop tail is handled by always copying 64 bytes from the end. ++*/ ++ ++ENTRY_ALIAS (__memmove_aarch64_sve) ++ENTRY (__memcpy_aarch64_sve) ++ cmp count, 128 ++ b.hi L(copy_long) ++ cntb vlen ++ cmp count, vlen, lsl 1 ++ b.hi L(copy32_128) ++ ++ whilelo p0.b, xzr, count ++ whilelo p1.b, vlen, count ++ ld1b z0.b, p0/z, [src, 0, mul vl] ++ ld1b z1.b, p1/z, [src, 1, mul vl] ++ st1b z0.b, p0, [dstin, 0, mul vl] ++ st1b z1.b, p1, [dstin, 1, mul vl] ++ ret ++ ++ /* Medium copies: 33..128 bytes. */ ++L(copy32_128): ++ add srcend, src, count ++ add dstend, dstin, count ++ ldp A_q, B_q, [src] ++ ldp C_q, D_q, [srcend, -32] ++ cmp count, 64 ++ b.hi L(copy128) ++ stp A_q, B_q, [dstin] ++ stp C_q, D_q, [dstend, -32] ++ ret ++ ++ /* Copy 65..128 bytes. */ ++L(copy128): ++ ldp E_q, F_q, [src, 32] ++ cmp count, 96 ++ b.ls L(copy96) ++ ldp G_q, H_q, [srcend, -64] ++ stp G_q, H_q, [dstend, -64] ++L(copy96): ++ stp A_q, B_q, [dstin] ++ stp E_q, F_q, [dstin, 32] ++ stp C_q, D_q, [dstend, -32] ++ ret ++ ++ /* Copy more than 128 bytes. */ ++L(copy_long): ++ add srcend, src, count ++ add dstend, dstin, count ++ ++ /* Use backwards copy if there is an overlap. */ ++ sub tmp1, dstin, src ++ cmp tmp1, count ++ b.lo L(copy_long_backwards) ++ ++ /* Copy 16 bytes and then align src to 16-byte alignment. */ ++ ldr D_q, [src] ++ and tmp1, src, 15 ++ bic src, src, 15 ++ sub dst, dstin, tmp1 ++ add count, count, tmp1 /* Count is now 16 too large. */ ++ ldp A_q, B_q, [src, 16] ++ str D_q, [dstin] ++ ldp C_q, D_q, [src, 48] ++ subs count, count, 128 + 16 /* Test and readjust count. 
*/ ++ b.ls L(copy64_from_end) ++L(loop64): ++ stp A_q, B_q, [dst, 16] ++ ldp A_q, B_q, [src, 80] ++ stp C_q, D_q, [dst, 48] ++ ldp C_q, D_q, [src, 112] ++ add src, src, 64 ++ add dst, dst, 64 ++ subs count, count, 64 ++ b.hi L(loop64) ++ ++ /* Write the last iteration and copy 64 bytes from the end. */ ++L(copy64_from_end): ++ ldp E_q, F_q, [srcend, -64] ++ stp A_q, B_q, [dst, 16] ++ ldp A_q, B_q, [srcend, -32] ++ stp C_q, D_q, [dst, 48] ++ stp E_q, F_q, [dstend, -64] ++ stp A_q, B_q, [dstend, -32] ++ ret ++ ++ /* Large backwards copy for overlapping copies. ++ Copy 16 bytes and then align srcend to 16-byte alignment. */ ++L(copy_long_backwards): ++ cbz tmp1, L(return) ++ ldr D_q, [srcend, -16] ++ and tmp1, srcend, 15 ++ bic srcend, srcend, 15 ++ sub count, count, tmp1 ++ ldp A_q, B_q, [srcend, -32] ++ str D_q, [dstend, -16] ++ ldp C_q, D_q, [srcend, -64] ++ sub dstend, dstend, tmp1 ++ subs count, count, 128 ++ b.ls L(copy64_from_start) ++ ++L(loop64_backwards): ++ str B_q, [dstend, -16] ++ str A_q, [dstend, -32] ++ ldp A_q, B_q, [srcend, -96] ++ str D_q, [dstend, -48] ++ str C_q, [dstend, -64]! ++ ldp C_q, D_q, [srcend, -128] ++ sub srcend, srcend, 64 ++ subs count, count, 64 ++ b.hi L(loop64_backwards) ++ ++ /* Write the last iteration and copy 64 bytes from the start. */ ++L(copy64_from_start): ++ ldp E_q, F_q, [src, 32] ++ stp A_q, B_q, [dstend, -32] ++ ldp A_q, B_q, [src] ++ stp C_q, D_q, [dstend, -64] ++ stp E_q, F_q, [dstin, 32] ++ stp A_q, B_q, [dstin] ++L(return): ++ ret ++ ++END (__memcpy_aarch64_sve) +diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h +new file mode 100644 +index 000000000..9e8fb0e0a +--- /dev/null ++++ b/net/venetcls/venetcls.h +@@ -0,0 +1,187 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _NET_VENETCLS_H ++#define _NET_VENETCLS_H ++#include ++#include ++#include ++ ++#define VECLS_MAX_NETDEV_NUM 8 ++#define VECLS_MAX_RXQ_NUM_PER_DEV 256 ++#define VECLS_MAX_CPU_NUM 1024 ++ ++#define VECLS_TIMEOUT (5 * HZ) ++#define VECLS_NO_FILTER 0xffff ++#define VECLS_NO_CPU 0xffff ++ ++struct vecls_netdev_queue_info { ++ int irq; ++ int affinity_cpu; ++}; ++ ++struct vecls_netdev_info { ++ char dev_name[IFNAMSIZ]; ++ struct net_device *netdev; ++ int rxq_num; ++ struct vecls_netdev_queue_info rxq[VECLS_MAX_RXQ_NUM_PER_DEV]; ++ int old_filter_state; ++}; ++ ++struct vecls_rxq { ++ int rxq_id; ++ int status; ++}; ++ ++struct vecls_numa_clusterinfo { ++ int cluster_id; ++ int cur_freeidx; ++ struct vecls_rxq rxqs[VECLS_MAX_RXQ_NUM_PER_DEV]; ++}; ++ ++struct vecls_numa_bound_dev_info { ++ DECLARE_BITMAP(bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV); ++ struct vecls_numa_clusterinfo *cluster_info; ++}; ++ ++struct vecls_numa_info { ++ DECLARE_BITMAP(avail_cpus, VECLS_MAX_CPU_NUM); ++ struct vecls_numa_bound_dev_info bound_dev[VECLS_MAX_NETDEV_NUM]; ++}; ++ ++struct cmd_context { ++ char netdev[IFNAMSIZ]; ++ u32 dip4; ++ u16 dport; ++ u16 action; ++ u32 ruleid; ++ u32 del_ruleid; ++ int ret_loc; ++}; ++ ++#define VECLS_SK_RULE_HASHSIZE 256 ++#define VECLS_SK_RULE_HASHMASK (VECLS_SK_RULE_HASHSIZE - 1) ++ ++struct vecls_sk_rule_list { ++ struct hlist_head hash[VECLS_SK_RULE_HASHSIZE]; ++ /* Mutex to synchronize access to ntuple rule locking */ ++ struct mutex mutex; ++}; ++ ++struct vecls_sk_rule { ++ struct hlist_node node; ++ int devid; ++ void *sk; ++ int dip4; ++ int dport; ++ int action; ++ int ruleid; ++ int nid; ++}; ++ ++struct vecls_sk_entry { ++ struct hlist_node node; ++ void *sk; ++ u32 sk_rule_hash; ++}; ++ ++struct vecls_dev_flow { ++ unsigned short 
cpu; ++ unsigned short filter; ++ unsigned long timeout; ++ int isvalid; ++}; ++ ++struct vecls_dev_flow_table { ++ unsigned int mask; ++ struct rcu_head rcu; ++ struct vecls_dev_flow flows[]; ++}; ++ ++struct vecls_sock_flow_table { ++ u32 mask; ++ u32 ents[] ____cacheline_aligned_in_smp; ++}; ++ ++#define VECLS_DEV_FLOW_TABLE_NUM 0x1000 ++#define VECLS_SOCK_FLOW_TABLE_NUM 0x100000 ++#define VECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct vecls_dev_flow_table) + \ ++ ((_num) * sizeof(struct vecls_dev_flow))) ++#define VECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct vecls_sock_flow_table, ents[_num])) ++ ++#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ++ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) ++#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ ++ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ ++ NETIF_F_RXHASH) ++ ++struct rmgr_ctrl { ++ int driver_select; ++ unsigned long *slot; ++ __u32 n_rules; ++ __u32 size; ++}; ++ ++struct cfg_param { ++ struct work_struct work; ++ struct cmd_context ctx; ++ struct sock *sk; ++ bool is_del; ++ int nid; ++ int cpu; ++}; ++ ++extern int match_ip_flag; ++extern int debug; ++extern int vecls_netdev_num; ++extern int vecls_numa_num; ++ ++#define vecls_debug(fmt, ...) \ ++ do { \ ++ if (debug) \ ++ trace_printk(fmt, ## __VA_ARGS__); \ ++ } while (0) ++ ++#define vecls_error(fmt, ...) \ ++ do { \ ++ pr_err_ratelimited("venetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); \ ++ trace_printk(fmt, ## __VA_ARGS__); \ ++ } while (0) ++ ++struct vecls_netdev_info *get_vecls_netdev_info(unsigned int index); ++ ++#define for_each_vecls_netdev(devid, vecls_dev) \ ++ for (devid = 0, vecls_dev = get_vecls_netdev_info(devid); \ ++ (devid < vecls_netdev_num) && vecls_dev; \ ++ devid++, vecls_dev = get_vecls_netdev_info(devid)) ++ ++struct vecls_numa_info *get_vecls_numa_info(unsigned int nid); ++ ++#define for_each_vecls_numa(nid, numa_info) \ ++ for (nid = 0, numa_info = get_vecls_numa_info(nid); \ ++ (nid < vecls_numa_num) && numa_info; \ ++ nid++, numa_info = get_vecls_numa_info(nid)) ++ ++#ifdef CONFIG_ARM64_SVE ++void *__memcpy_aarch64_sve(void *, const void *, size_t); ++#define memcpy_r(dst, src, len) \ ++ do { \ ++ if (system_supports_sve()) \ ++ __memcpy_aarch64_sve(dst, src, len); \ ++ else \ ++ memcpy(dst, src, len); \ ++ } while (0) ++#else ++#define memcpy_r(dst, src, len) memcpy(dst, src, len) ++#endif ++ ++int check_appname(char *task_name); ++int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd); ++int alloc_rxq_id(int nid, int cpu, int devid); ++void free_rxq_id(int nid, int devid, int rxq_id); ++int vecls_ntuple_res_init(void); ++void vecls_ntuple_res_clean(void); ++int venetcls_ntuple_status(struct seq_file *seq, void *v); ++int vecls_flow_res_init(void); ++void vecls_flow_res_clean(void); ++int venetcls_flow_status(struct seq_file *seq, void *v); ++ ++#endif /* _NET_VENETCLS_H */ +diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c +new file mode 100644 +index 000000000..f2d7e42ce +--- /dev/null ++++ b/net/venetcls/venetcls_flow.c +@@ -0,0 +1,491 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "venetcls.h" ++ ++static u32 vecls_cpu_mask; ++static struct vecls_sock_flow_table __rcu *vecls_sock_flow_table; ++static DEFINE_MUTEX(vecls_sock_flow_mutex); ++static DEFINE_SPINLOCK(vecls_dev_flow_lock); ++ ++bool is_vecls_config_netdev(const char *name) ++{ ++ struct vecls_netdev_info 
*netdev_info; ++ int netdev_loop; ++ ++ for_each_vecls_netdev(netdev_loop, netdev_info) ++ if (strcmp(netdev_info->dev_name, name) == 0) ++ return true; ++ ++ return false; ++} ++ ++static bool _vecls_timeout(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id) ++{ ++ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; ++ struct vecls_dev_flow_table *flow_table; ++ struct vecls_dev_flow *rflow; ++ bool expire = true; ++ unsigned int cpu; ++ ++ rcu_read_lock(); ++ flow_table = rcu_dereference(rxqueue->vecls_ftb); ++ if (flow_table && flow_id <= flow_table->mask) { ++ rflow = &flow_table->flows[flow_id]; ++ cpu = READ_ONCE(rflow->cpu); ++ if (rflow->filter == filter_id && cpu < nr_cpu_ids) { ++ if (time_before(jiffies, rflow->timeout + VECLS_TIMEOUT)) { ++ expire = false; ++ } else { ++ rflow->isvalid = 0; ++ WRITE_ONCE(rflow->cpu, VECLS_NO_CPU); ++ } ++ } ++ } ++ rcu_read_unlock(); ++ if (expire) ++ vecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__, ++ dev->name, rxq_index, flow_id, filter_id, expire); ++ return expire; ++} ++ ++static void _vecls_flow_update(struct sock *sk) ++{ ++ struct vecls_sock_flow_table *tb; ++ unsigned int hash, index; ++ u32 val; ++ u32 cpu = raw_smp_processor_id(); ++ ++ if (sk->sk_state != TCP_ESTABLISHED) ++ return; ++ ++ if (check_appname(current->comm)) ++ return; ++ ++ rcu_read_lock(); ++ tb = rcu_dereference(vecls_sock_flow_table); ++ hash = READ_ONCE(sk->sk_rxhash); ++ if (tb && hash) { ++ index = hash & tb->mask; ++ val = hash & ~vecls_cpu_mask; ++ val |= cpu; ++ ++ if (READ_ONCE(tb->ents[index]) != val) { ++ WRITE_ONCE(tb->ents[index], val); ++ } ++ } ++ rcu_read_unlock(); ++} ++ ++static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb) ++{ ++ struct vecls_netdev_info *netdev_info; ++ int netdev_loop; ++ u32 hash, index; ++ struct vecls_numa_info *numa_info; ++ struct vecls_numa_bound_dev_info *bound_dev = NULL; ++ int rxq_id, rxq_num, i; ++ ++ numa_info = get_vecls_numa_info(nid); ++ if (!numa_info) ++ return -1; ++ ++ for_each_vecls_netdev(netdev_loop, netdev_info) { ++ if (strcmp(netdev_info->dev_name, dev->name) == 0) { ++ bound_dev = &numa_info->bound_dev[netdev_loop]; ++ break; ++ } ++ } ++ ++ if (!bound_dev) ++ return -1; ++ rxq_num = bitmap_weight(bound_dev->bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV); ++ if (rxq_num == 0) ++ return -1; ++ ++ hash = skb_get_hash(skb); ++ index = hash % rxq_num; ++ ++ i = 0; ++ for_each_set_bit(rxq_id, bound_dev->bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV) ++ if (index == i++) ++ return rxq_id; ++ ++ return -1; ++} ++ ++static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct vecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu) ++{ ++ struct netdev_rx_queue *rxqueue; ++ struct vecls_dev_flow_table *dtb; ++ struct vecls_dev_flow *rflow; ++ u32 flow_id, hash; ++ u16 rxq_index; ++ int rc; ++ ++ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || ++ !(dev->features & NETIF_F_NTUPLE)) ++ return; ++ ++ rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); ++ if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0) { ++ vecls_debug("%s skb:%p, old_rxq:%d, next_cpu:%d new_rxq:%d\n", ++ __func__, skb, old_rxq_id, next_cpu, rxq_index); ++ return; ++ } ++ ++ rxqueue = dev->_rx + rxq_index; ++ dtb = rcu_dereference(rxqueue->vecls_ftb); ++ if (!dtb) ++ return; ++ ++ hash = skb_get_hash(skb); ++ flow_id = hash & dtb->mask; ++ rflow = &dtb->flows[flow_id]; ++ ++ if (rflow->isvalid && cpu_to_node(rflow->cpu) == 
cpu_to_node(next_cpu)) { ++ rflow->timeout = jiffies; ++ return; ++ } ++ ++ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); ++ if (rc < 0) { ++ vecls_debug("%s ndo_rx_flow_steer skb:%p rxq:%d hash:0x%x flow_id:%u" ++ "old_rxq:%d rflow->cpu:%d rflow->isvalid:%d next_cpu:%d rc:%d\n", ++ __func__, skb, rxq_index, hash, flow_id, old_rxq_id, rflow->cpu, ++ rflow->isvalid, next_cpu, rc); ++ return; ++ } ++ ++ rflow->filter = rc; ++ rflow->isvalid = 1; ++ rflow->timeout = jiffies; ++ if (old_rflow->filter == rflow->filter) ++ old_rflow->filter = VECLS_NO_FILTER; ++ rflow->cpu = next_cpu; ++} ++ ++static int get_cpu_in_numa(int tcpu, u32 hash) ++{ ++ const struct cpumask *mask; ++ int nr_cpus, cpu, index; ++ ++ mask = cpumask_of_node(cpu_to_node(tcpu)); ++ nr_cpus = cpumask_weight(mask); ++ if (nr_cpus == 0) ++ return -1; ++ ++ index = reciprocal_scale(hash, nr_cpus); ++ if (index < 0) ++ return -1; ++ ++ cpu = cpumask_first(mask); ++ while (--nr_cpus > 0) { ++ if (index == 0) ++ break; ++ cpu = cpumask_next(cpu, mask); ++ index--; ++ } ++ return cpu; ++} ++ ++static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, ++ struct vecls_sock_flow_table *tb, struct vecls_dev_flow_table *dtb, ++ int old_rxq_id, int *rcpu, int *last_qtail) ++{ ++ struct vecls_dev_flow *rflow; ++ u32 last_recv_cpu, hash, val; ++ int tcpu = 0, newcpu; ++ u32 cpu = raw_smp_processor_id(); ++ ++ skb_reset_network_header(skb); ++ hash = skb_get_hash(skb); ++ if (!hash) ++ return; ++ ++ val = READ_ONCE(tb->ents[hash & tb->mask]); ++ last_recv_cpu = val & vecls_cpu_mask; ++ rflow = &dtb->flows[hash & dtb->mask]; ++ tcpu = rflow->cpu; ++ ++ if ((val ^ hash) & ~vecls_cpu_mask) ++ return; ++ ++ newcpu = get_cpu_in_numa(last_recv_cpu, hash); ++ if (newcpu >= 0) ++ *rcpu = newcpu; ++ else ++ newcpu = last_recv_cpu; ++ ++ if (cpu_to_node(cpu) == cpu_to_node(newcpu)) ++ return; ++ ++ if (tcpu >= nr_cpu_ids) ++ set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); ++} ++ ++static void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) ++{ ++ struct net_device *ndev = skb->dev; ++ struct vecls_sock_flow_table *stb; ++ struct vecls_dev_flow_table *dtb; ++ struct netdev_rx_queue *rxqueue; ++ int rxq_id = -1; ++ ++ *cpu = -1; ++ last_qtail = 0;//unused ++ if (!ndev) ++ return; ++ ++ if (!is_vecls_config_netdev(ndev->name)) ++ return; ++ ++ rxqueue = ndev->_rx; ++ if (skb_rx_queue_recorded(skb)) { ++ rxq_id = skb_get_rx_queue(skb); ++ if (rxq_id >= ndev->real_num_rx_queues) { ++ vecls_debug("%s ndev:%s rxq:%d real_num:%d\n", __func__, ++ ndev->name, rxq_id, ndev->real_num_rx_queues); ++ return; ++ } ++ rxqueue += rxq_id; ++ } ++ ++ if (rxq_id < 0) ++ return; ++ ++ rcu_read_lock(); ++ stb = rcu_dereference(vecls_sock_flow_table); ++ dtb = rcu_dereference(rxqueue->vecls_ftb); ++ if (stb && dtb) ++ __vecls_set_cpu(skb, ndev, stb, dtb, rxq_id, cpu, last_qtail); ++ rcu_read_unlock(); ++} ++ ++static void vecls_dev_flow_table_free(struct rcu_head *rcu) ++{ ++ struct vecls_dev_flow_table *table = container_of(rcu, ++ struct vecls_dev_flow_table, rcu); ++ vfree(table); ++} ++ ++static void vecls_dev_flow_table_cleanup(struct net_device *netdev, int qid) ++{ ++ struct vecls_dev_flow_table *dtb; ++ struct netdev_rx_queue *queue; ++ int i; ++ ++ spin_lock(&vecls_dev_flow_lock); ++ for (i = 0; i < qid; i++) { ++ queue = netdev->_rx + i; ++ dtb = rcu_dereference_protected(queue->vecls_ftb, ++ lockdep_is_held(&vecls_dev_flow_lock)); ++ rcu_assign_pointer(queue->vecls_ftb, NULL); ++ } ++ 
spin_unlock(&vecls_dev_flow_lock); ++ call_rcu(&dtb->rcu, vecls_dev_flow_table_free); ++} ++ ++static int vecls_dev_flow_table_release(void) ++{ ++ struct vecls_netdev_info *netdev_info; ++ int netdev_loop; ++ struct net_device *netdev; ++ ++ for_each_vecls_netdev(netdev_loop, netdev_info) { ++ netdev = netdev_info->netdev; ++ if (!netdev) ++ continue; ++ vecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues); ++ } ++ ++ return 0; ++} ++ ++static int _vecls_dev_flow_table_init(struct net_device *netdev) ++{ ++ struct vecls_dev_flow_table *table; ++ int size = VECLS_DEV_FLOW_TABLE_NUM; ++ struct netdev_rx_queue *queue; ++ int i, j, ret = 0; ++ ++ size = roundup_pow_of_two(size); ++ vecls_debug("%s dev:%s num_rx_queues:%d mask:0x%x\n", ++ __func__, netdev->name, netdev->num_rx_queues, size - 1); ++ ++ for (i = 0; i < netdev->num_rx_queues; i++) { ++ table = vmalloc(VECLS_DEV_FLOW_TABLE_SIZE(size)); ++ if (!table) { ++ ret = -ENOMEM; ++ goto fail; ++ } ++ ++ table->mask = size - 1; ++ for (j = 0; j < size; j++) { ++ table->flows[j].cpu = VECLS_NO_CPU; ++ table->flows[j].isvalid = 0; ++ } ++ ++ queue = netdev->_rx + i; ++ ++ spin_lock(&vecls_dev_flow_lock); ++ rcu_assign_pointer(queue->vecls_ftb, table); ++ spin_unlock(&vecls_dev_flow_lock); ++ } ++ return ret; ++fail: ++ vecls_dev_flow_table_cleanup(netdev, i); ++ return ret; ++} ++ ++static int vecls_dev_flow_table_init(void) ++{ ++ struct vecls_netdev_info *netdev_info; ++ int netdev_loop; ++ struct net_device *ndev; ++ int i, err; ++ ++ for_each_vecls_netdev(netdev_loop, netdev_info) { ++ ndev = netdev_info->netdev; ++ if (!ndev) ++ continue; ++ err = _vecls_dev_flow_table_init(ndev); ++ if (err) ++ goto out; ++ } ++ ++ return 0; ++out: ++ for (i = 0; i < netdev_loop; i++) { ++ netdev_info = get_vecls_netdev_info(i); ++ ndev = netdev_info->netdev; ++ if (!ndev) ++ continue; ++ vecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues); ++ } ++ return err; ++} ++ ++static const struct vecls_hook_ops vecls_flow_ops = { ++ .vecls_flow_update = _vecls_flow_update, ++ .vecls_set_cpu = _vecls_set_cpu, ++ .vecls_timeout = _vecls_timeout, ++ .vecls_cfg_rxcls = NULL, ++}; ++ ++static int vecls_sock_flow_table_release(void) ++{ ++ struct vecls_sock_flow_table *tb; ++ ++ mutex_lock(&vecls_sock_flow_mutex); ++ tb = rcu_dereference_protected(vecls_sock_flow_table, ++ lockdep_is_held(&vecls_sock_flow_mutex)); ++ if (tb) ++ rcu_assign_pointer(vecls_sock_flow_table, NULL); ++ mutex_unlock(&vecls_sock_flow_mutex); ++ synchronize_rcu(); ++ vfree(tb); ++ ++ return 0; ++} ++ ++int venetcls_flow_status(struct seq_file *seq, void *v) ++{ ++ struct vecls_netdev_info *netdev_info; ++ struct vecls_dev_flow_table *dtb; ++ struct netdev_rx_queue *queue; ++ struct net_device *netdev; ++ int netdev_loop, i, j; ++ ++ seq_printf(seq, "%-16s %-6s %-12s %-12s %-12s\n", ++ "Interface", "rxq", "flowCPU", "filterId", "timeout"); ++ spin_lock(&vecls_dev_flow_lock); ++ for_each_vecls_netdev(netdev_loop, netdev_info) { ++ netdev = netdev_info->netdev; ++ if (!netdev) ++ continue; ++ for (i = 0; i < netdev->num_rx_queues; i++) { ++ queue = netdev->_rx + i; ++ dtb = rcu_dereference_protected(queue->vecls_ftb, ++ lockdep_is_held(&vecls_dev_flow_lock)); ++ if (!dtb) ++ continue; ++ for (j = 0; j < VECLS_DEV_FLOW_TABLE_NUM; j++) { ++ if (dtb->flows[j].cpu == VECLS_NO_CPU) ++ continue; ++ if (dtb->flows[j].isvalid == 0) ++ continue; ++ if (time_before(jiffies, dtb->flows[j].timeout + VECLS_TIMEOUT)) { ++ seq_printf(seq, "%-16s %-6d %-12d %-12d %-12u\n", netdev_info->dev_name, ++ i, 
dtb->flows[j].cpu, dtb->flows[j].filter, ++ jiffies_to_msecs(dtb->flows[j].timeout + VECLS_TIMEOUT - jiffies)); ++ } ++ } ++ } ++ } ++ spin_unlock(&vecls_dev_flow_lock); ++ ++ return 0; ++} ++ ++static int vecls_sock_flow_table_init(void) ++{ ++ struct vecls_sock_flow_table *table; ++ int size = VECLS_SOCK_FLOW_TABLE_NUM; ++ int i; ++ ++ size = roundup_pow_of_two(size); ++ table = vmalloc(VECLS_SOCK_FLOW_TABLE_SIZE(size)); ++ if (!table) ++ return -ENOMEM; ++ ++ vecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; ++ vecls_debug("nr_cpu_ids:%d, vecls_cpu_mask:0x%x\n", nr_cpu_ids, vecls_cpu_mask); ++ ++ table->mask = size - 1; ++ for (i = 0; i < size; i++) ++ table->ents[i] = VECLS_NO_CPU; ++ ++ mutex_lock(&vecls_sock_flow_mutex); ++ rcu_assign_pointer(vecls_sock_flow_table, table); ++ mutex_unlock(&vecls_sock_flow_mutex); ++ ++ return 0; ++} ++ ++int vecls_flow_res_init(void) ++{ ++ int err; ++ ++ err = vecls_sock_flow_table_init(); ++ if (err) ++ return err; ++ err = vecls_dev_flow_table_init(); ++ if (err) ++ goto clean; ++ ++ RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); ++ synchronize_rcu(); ++ ++ return 0; ++clean: ++ vecls_sock_flow_table_release(); ++ return err; ++} ++ ++void vecls_flow_res_clean(void) ++{ ++ RCU_INIT_POINTER(vecls_ops, NULL); ++ synchronize_rcu(); ++ vecls_sock_flow_table_release(); ++ vecls_dev_flow_table_release(); ++} +diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c +new file mode 100644 +index 000000000..80895035f +--- /dev/null ++++ b/net/venetcls/venetcls_main.c +@@ -0,0 +1,1086 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "venetcls.h" ++ ++int vecls_netdev_num; ++static struct vecls_netdev_info vecls_netdev_info_table[VECLS_MAX_NETDEV_NUM]; ++ ++int vecls_numa_num; ++static int vecls_cluster_cpu_num, vecls_cluster_per_numa; ++static struct vecls_numa_info *vecls_numa_info_table; ++ ++int debug; ++module_param(debug, int, 0644); ++MODULE_PARM_DESC(debug, "debug switch"); ++ ++static int mode; ++module_param(mode, int, 0444); ++MODULE_PARM_DESC(mode, "mode, default 0"); ++ ++static char ifname[64] = { 0 }; ++module_param_string(ifname, ifname, sizeof(ifname), 0444); ++MODULE_PARM_DESC(ifname, "ifname"); ++ ++static char appname[64] = "redis-server"; ++module_param_string(appname, appname, sizeof(appname), 0644); ++MODULE_PARM_DESC(appname, "appname, default redis-server"); ++ ++int match_ip_flag = 1; ++module_param(match_ip_flag, int, 0644); ++MODULE_PARM_DESC(match_ip_flag, "match ip flag"); ++ ++static int strategy; ++module_param(strategy, int, 0444); ++MODULE_PARM_DESC(strategy, "strategy, default 0"); ++ ++static bool check_params(void) ++{ ++ if (mode != 0 && mode != 1) ++ return false; ++ ++ if (strlen(ifname) == 0) ++ return false; ++ ++ return true; ++} ++ ++int check_appname(char *task_name) ++{ ++ char *start = appname, *end; ++ ++ if (!strlen(appname)) ++ return 0; ++ ++ // support appname: app1#app2#appN ++ while (*start != '\0') { ++ end = strchr(start, '#'); ++ if (end == start) { ++ start++; ++ continue; ++ } ++ ++ if (!end) { ++ if (!strncmp(task_name, start, strlen(start))) ++ return 0; ++ break; ++ } ++ ++ if (!strncmp(task_name, start, end - start)) ++ return 0; ++ start = end + 1; ++ } ++ return -EOPNOTSUPP; ++} ++ ++static u32 __ethtool_get_flags(struct net_device *dev) ++{ ++ u32 flags = 0; ++ ++ if (dev->features & NETIF_F_LRO) ++ flags |= ETH_FLAG_LRO; ++ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) 
++ flags |= ETH_FLAG_RXVLAN; ++ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) ++ flags |= ETH_FLAG_TXVLAN; ++ if (dev->features & NETIF_F_NTUPLE) ++ flags |= ETH_FLAG_NTUPLE; ++ if (dev->features & NETIF_F_RXHASH) ++ flags |= ETH_FLAG_RXHASH; ++ ++ return flags; ++} ++ ++static int __ethtool_set_flags(struct net_device *dev, u32 data) ++{ ++ netdev_features_t features = 0, changed; ++ ++ if (data & ~ETH_ALL_FLAGS) ++ return -EINVAL; ++ ++ if (data & ETH_FLAG_LRO) ++ features |= NETIF_F_LRO; ++ if (data & ETH_FLAG_RXVLAN) ++ features |= NETIF_F_HW_VLAN_CTAG_RX; ++ if (data & ETH_FLAG_TXVLAN) ++ features |= NETIF_F_HW_VLAN_CTAG_TX; ++ if (data & ETH_FLAG_NTUPLE) ++ features |= NETIF_F_NTUPLE; ++ if (data & ETH_FLAG_RXHASH) ++ features |= NETIF_F_RXHASH; ++ ++ /* allow changing only bits set in hw_features */ ++ changed = (features ^ dev->features) & ETH_ALL_FEATURES; ++ if (changed & ~dev->hw_features) ++ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; ++ ++ dev->wanted_features = ++ (dev->wanted_features & ~changed) | (features & changed); ++ ++ __netdev_update_features(dev); ++ ++ return 0; ++} ++ ++static void ethtool_rxnfc_copy_to_user(void *useraddr, ++ const struct ethtool_rxnfc *rxnfc, ++ size_t size, const u32 *rule_buf) ++{ ++ memcpy_r(useraddr, rxnfc, size); ++ useraddr += offsetof(struct ethtool_rxnfc, rule_locs); ++ ++ if (rule_buf) ++ memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32)); ++} ++ ++static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, ++ u32 cmd, void *useraddr) ++{ ++ struct ethtool_rxnfc info; ++ size_t info_size = sizeof(info); ++ int rc; ++ ++ if (!dev->ethtool_ops->set_rxnfc) ++ return -EOPNOTSUPP; ++ ++ if (cmd == ETHTOOL_SRXFH) ++ info_size = (offsetof(struct ethtool_rxnfc, data) + ++ sizeof(info.data)); ++ ++ memcpy_r(&info, useraddr, info_size); ++ rc = dev->ethtool_ops->set_rxnfc(dev, &info); ++ if (rc) ++ return rc; ++ ++ if (cmd == ETHTOOL_SRXCLSRLINS) ++ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); ++ ++ return 0; ++} ++ ++static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, ++ u32 cmd, void *useraddr) ++{ ++ struct ethtool_rxnfc info; ++ size_t info_size = sizeof(info); ++ const struct ethtool_ops *ops = dev->ethtool_ops; ++ int ret; ++ void *rule_buf = NULL; ++ ++ if (!ops->get_rxnfc) ++ return -EOPNOTSUPP; ++ ++ if (cmd == ETHTOOL_GRXFH) ++ info_size = (offsetof(struct ethtool_rxnfc, data) + ++ sizeof(info.data)); ++ ++ memcpy_r(&info, useraddr, info_size); ++ ++ /* If FLOW_RSS was requested then user-space must be using the ++ * new definition, as FLOW_RSS is newer. ++ */ ++ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { ++ info_size = sizeof(info); ++ memcpy_r(&info, useraddr, info_size); ++ /* Since malicious users may modify the original data, ++ * we need to check whether FLOW_RSS is still requested. 
++ */ ++ if (!(info.flow_type & FLOW_RSS)) ++ return -EINVAL; ++ } ++ ++ if (info.cmd != cmd) ++ return -EINVAL; ++ ++ if (info.cmd == ETHTOOL_GRXCLSRLALL) { ++ if (info.rule_cnt > 0) { ++ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) ++ rule_buf = kcalloc(info.rule_cnt, sizeof(u32), ++ GFP_KERNEL); ++ if (!rule_buf) ++ return -ENOMEM; ++ } ++ } ++ ++ ret = ops->get_rxnfc(dev, &info, rule_buf); ++ if (ret < 0) ++ goto err_out; ++ ++ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); ++err_out: ++ kfree(rule_buf); ++ ++ return ret; ++} ++ ++static noinline_for_stack int ethtool_get_channels(struct net_device *dev, ++ void *useraddr) ++{ ++ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; ++ ++ if (!dev->ethtool_ops->get_channels) ++ return -EOPNOTSUPP; ++ ++ dev->ethtool_ops->get_channels(dev, &channels); ++ ++ memcpy_r(useraddr, &channels, sizeof(channels)); ++ return 0; ++} ++ ++static int ethtool_get_value(struct net_device *dev, char *useraddr, ++ u32 cmd, u32 (*actor)(struct net_device *)) ++{ ++ struct ethtool_value edata = { .cmd = cmd }; ++ ++ if (!actor) ++ return -EOPNOTSUPP; ++ ++ edata.data = actor(dev); ++ ++ memcpy_r(useraddr, &edata, sizeof(edata)); ++ return 0; ++} ++ ++static int ethtool_set_value(struct net_device *dev, char *useraddr, ++ int (*actor)(struct net_device *, u32)) ++{ ++ struct ethtool_value edata; ++ ++ if (!actor) ++ return -EOPNOTSUPP; ++ ++ memcpy_r(&edata, useraddr, sizeof(edata)); ++ ++ return actor(dev, edata.data); ++} ++ ++static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) ++{ ++ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); ++ void *useraddr = ifr->ifr_data; ++ u32 ethcmd, sub_cmd; ++ int rc; ++ netdev_features_t old_features; ++ ++ if (!dev || !netif_device_present(dev)) ++ return -ENODEV; ++ ++ memcpy_r(ðcmd, useraddr, sizeof(ethcmd)); ++ ++ if (ethcmd == ETHTOOL_PERQUEUE) ++ memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)); ++ else ++ sub_cmd = ethcmd; ++ ++ if (dev->ethtool_ops->begin) { ++ rc = dev->ethtool_ops->begin(dev); ++ if (rc < 0) ++ return rc; ++ } ++ old_features = dev->features; ++ ++ switch (ethcmd) { ++ case ETHTOOL_GFLAGS: ++ rc = ethtool_get_value(dev, useraddr, ethcmd, ++ __ethtool_get_flags); ++ break; ++ case ETHTOOL_SFLAGS: ++ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); ++ break; ++ case ETHTOOL_GRXFH: ++ case ETHTOOL_GRXRINGS: ++ case ETHTOOL_GRXCLSRLCNT: ++ case ETHTOOL_GRXCLSRULE: ++ case ETHTOOL_GRXCLSRLALL: ++ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); ++ break; ++ case ETHTOOL_SRXFH: ++ case ETHTOOL_SRXCLSRLDEL: ++ case ETHTOOL_SRXCLSRLINS: ++ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); ++ break; ++ case ETHTOOL_GCHANNELS: ++ rc = ethtool_get_channels(dev, useraddr); ++ break; ++ default: ++ rc = -EOPNOTSUPP; ++ } ++ ++ if (dev->ethtool_ops->complete) ++ dev->ethtool_ops->complete(dev); ++ ++ if (old_features != dev->features) ++ netdev_features_change(dev); ++ ++ return rc; ++} ++ ++int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd) ++{ ++ struct ifreq ifr = {0}; ++ int ret; ++ ++ strncpy(ifr.ifr_name, ctx->netdev, IFNAMSIZ); ++ ifr.ifr_data = cmd; ++ ++ rtnl_lock(); ++ ret = dev_ethtool_kern(&init_net, &ifr); ++ rtnl_unlock(); ++ ++ return ret; ++} ++ ++struct vecls_netdev_info *get_vecls_netdev_info(unsigned int index) ++{ ++ if (index >= VECLS_MAX_NETDEV_NUM) ++ return NULL; ++ return &vecls_netdev_info_table[index]; ++} ++ ++static struct vecls_netdev_info *alloc_vecls_netdev_info(void) ++{ ++ if 
(vecls_netdev_num >= VECLS_MAX_NETDEV_NUM) ++ return NULL; ++ ++ return &vecls_netdev_info_table[vecls_netdev_num++]; ++} ++ ++static bool check_irq_name(const char *irq_name, struct vecls_netdev_info *vecls_dev) ++{ ++ if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx")) ++ return false; ++ ++ if (strstr(irq_name, vecls_dev->dev_name)) ++ return true; ++ ++ if (vecls_dev->netdev->dev.parent && ++ strstr(irq_name, dev_name(vecls_dev->netdev->dev.parent))) ++ return true; ++ ++ return false; ++} ++ ++static void get_netdev_queue_info(struct vecls_netdev_info *vecls_dev) ++{ ++ struct vecls_netdev_queue_info *rxq_info; ++ struct irq_desc *desc; ++ int irq, cpu; ++ ++ for_each_irq_desc(irq, desc) { ++ if (!desc->action) ++ continue; ++ if (!desc->action->name) ++ continue; ++ if (!check_irq_name(desc->action->name, vecls_dev)) ++ continue; ++ if (vecls_dev->rxq_num >= VECLS_MAX_RXQ_NUM_PER_DEV) ++ break; ++ rxq_info = &vecls_dev->rxq[vecls_dev->rxq_num++]; ++ rxq_info->irq = irq; ++ cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data)); ++ rxq_info->affinity_cpu = cpu; ++ vecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n", ++ irq, desc->action->name, vecls_dev->rxq_num, cpu); ++ } ++} ++ ++static int vecls_filter_enable(const char *dev_name, bool *old_state) ++{ ++ struct ethtool_value eval = {0}; ++ struct cmd_context ctx = {0}; ++ int ret; ++ ++ strncpy(ctx.netdev, dev_name, IFNAMSIZ); ++ ++ eval.cmd = ETHTOOL_GFLAGS; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ if (eval.data & ETH_FLAG_NTUPLE) { ++ *old_state = true; ++ vecls_debug("%s ntuple is already on\n", dev_name); ++ return 0; ++ } ++ ++ // Set ntuple feature ++ eval.cmd = ETHTOOL_SFLAGS; ++ eval.data |= ETH_FLAG_NTUPLE; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("set %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ ++ // Get ntuple feature ++ eval.cmd = ETHTOOL_GFLAGS; ++ eval.data = 0; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ if (!(eval.data & ETH_FLAG_NTUPLE)) { ++ vecls_error("enable ntuple feature fail!\n"); ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++static void vecls_filter_restore(const char *dev_name, bool old_state) ++{ ++ struct ethtool_value eval = {0}; ++ struct cmd_context ctx = {0}; ++ bool cur_filter_state; ++ int ret; ++ ++ strncpy(ctx.netdev, dev_name, IFNAMSIZ); ++ ++ eval.cmd = ETHTOOL_GFLAGS; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return; ++ } ++ ++ cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? true : false; ++ if (cur_filter_state == old_state) ++ return; ++ ++ // Set ntuple feature ++ eval.cmd = ETHTOOL_SFLAGS; ++ if (old_state) ++ eval.data |= ETH_FLAG_NTUPLE; ++ else ++ eval.data &= ~ETH_FLAG_NTUPLE; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("set %s flags fail, ret:%d\n", dev_name, ret); ++ return; ++ } ++} ++ ++static int init_single_vecls_dev(char *if_name, unsigned int length) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ char dev_name[IFNAMSIZ] = { 0 }; ++ struct net_device *netdev; ++ int cpy_len = length < IFNAMSIZ ? 
length : IFNAMSIZ; ++ bool old_state = false; ++ int ret; ++ ++ strncpy(dev_name, if_name, cpy_len); ++ netdev = dev_get_by_name(&init_net, dev_name); ++ if (!netdev) { ++ vecls_error("dev [%s] is not exist!\n", dev_name); ++ return -ENODEV; ++ } ++ ++ if (!(netdev->flags & IFF_UP)) { ++ ret = -ENETDOWN; ++ vecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags); ++ goto out; ++ } ++ ++ if (netdev->flags & IFF_LOOPBACK) { ++ ret = -EOPNOTSUPP; ++ vecls_error("Do not support loopback.\n"); ++ goto out; ++ } ++ ++ ret = vecls_filter_enable(dev_name, &old_state); ++ if (ret) { ++ vecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret); ++ goto out; ++ } ++ ++ vecls_dev = alloc_vecls_netdev_info(); ++ if (!vecls_dev) { ++ ret = -ENOMEM; ++ vecls_filter_restore(dev_name, old_state); ++ vecls_error("alloc vecls_dev fail! vecls_netdev_num:%d\n", vecls_netdev_num); ++ goto out; ++ } ++ ++ memcpy_r(vecls_dev->dev_name, dev_name, IFNAMSIZ); ++ vecls_dev->old_filter_state = old_state; ++ vecls_dev->netdev = netdev; ++ get_netdev_queue_info(vecls_dev); ++ return 0; ++ ++out: ++ dev_put(netdev); ++ return ret; ++} ++ ++static void clean_vecls_netdev_info(void) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ struct net_device *netdev; ++ int devid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ vecls_filter_restore(vecls_dev->dev_name, vecls_dev->old_filter_state); ++ netdev = vecls_dev->netdev; ++ if (netdev) { ++ vecls_dev->netdev = NULL; ++ dev_put(netdev); ++ } ++ } ++ ++ vecls_netdev_num = 0; ++} ++ ++static int init_vecls_netdev_info(char *netdev_str) ++{ ++ char *start = netdev_str, *end; ++ int err = -ENODEV; ++ ++ while (*start != '\0') { ++ // skip start # ++ end = strchr(start, '#'); ++ if (end == start) { ++ start++; ++ continue; ++ } ++ ++ // find the last ifname ++ if (!end) { ++ err = init_single_vecls_dev(start, strlen(start)); ++ break; ++ } ++ ++ err = init_single_vecls_dev(start, end - start); ++ if (err) ++ break; ++ start = end + 1; ++ } ++ ++ return err; ++} ++ ++struct vecls_numa_info *get_vecls_numa_info(unsigned int nid) ++{ ++ if (nid >= vecls_numa_num) ++ return NULL; ++ return &vecls_numa_info_table[nid]; ++} ++ ++static void clean_vecls_numa_info(void) ++{ ++ vecls_numa_num = 0; ++ kfree(vecls_numa_info_table); ++} ++ ++static void init_numa_avail_cpus(int nid, struct vecls_numa_info *numa_info) ++{ ++ int cpu; ++ ++ vecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)), ++ cpumask_pr_args(cpumask_of_node(nid))); ++ ++ bitmap_zero(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); ++ for_each_cpu(cpu, cpumask_of_node(nid)) { ++ if (cpu >= VECLS_MAX_CPU_NUM) ++ return; ++ set_bit(cpu, numa_info->avail_cpus); ++ } ++} ++ ++static void clean_vecls_rxq(void) ++{ ++ struct vecls_numa_bound_dev_info *bound_dev; ++ struct vecls_netdev_info *vecls_dev; ++ struct vecls_numa_info *numa_info; ++ int nid, devid; ++ ++ for_each_vecls_numa(nid, numa_info) { ++ for_each_vecls_netdev(devid, vecls_dev) { ++ bound_dev = &numa_info->bound_dev[devid]; ++ kfree(bound_dev->cluster_info); ++ } ++ } ++} ++ ++static int init_numa_rxq_bitmap(int nid, struct vecls_numa_info *numa_info) ++{ ++ int bound_rxq_num, cluster_id, cluster_idx, cur_idx; ++ struct vecls_numa_bound_dev_info *bound_dev; ++ struct vecls_netdev_info *vecls_dev; ++ int rxq_id, devid, cpu, ret = 0; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ bound_rxq_num = 0; ++ bound_dev = &numa_info->bound_dev[devid]; ++ bitmap_zero(bound_dev->bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV); ++ 
bound_dev->cluster_info = kzalloc(sizeof(struct vecls_numa_clusterinfo) ++ * vecls_cluster_per_numa, GFP_ATOMIC); ++ if (!bound_dev->cluster_info) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { ++ cpu = vecls_dev->rxq[rxq_id].affinity_cpu; ++ if (cpu_to_node(cpu) == nid) { ++ set_bit(rxq_id, bound_dev->bitmap_rxq); ++ cluster_id = cpu / vecls_cluster_cpu_num; ++ cluster_idx = cluster_id % vecls_cluster_per_numa; ++ bound_dev->cluster_info[cluster_idx].cluster_id = cluster_id; ++ cur_idx = bound_dev->cluster_info[cluster_idx].cur_freeidx++; ++ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].rxq_id = rxq_id; ++ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].status = 1; ++ bound_rxq_num++; ++ vecls_debug("cpu:%d cluster_id:%d cluster_idx:%d rxq_id:%d cur_idx:%d\n", ++ cpu, cluster_id, cluster_idx, rxq_id, cur_idx); ++ } ++ } ++ ++ vecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bit_num:%d, bitmap_rxq:%*pbl\n", ++ nid, devid, vecls_dev->dev_name, vecls_dev->rxq_num, ++ bound_rxq_num, VECLS_MAX_RXQ_NUM_PER_DEV, bound_dev->bitmap_rxq); ++ } ++ return ret; ++ ++out: ++ clean_vecls_rxq(); ++ return ret; ++} ++ ++static int get_cluster_rxq(struct vecls_numa_bound_dev_info *bound_dev, int cpu) ++{ ++ int cluster_id = cpu / vecls_cluster_cpu_num; ++ int i, j, rxq_id; ++ ++ for (i = 0; i < vecls_cluster_per_numa; i++) { ++ if (cluster_id != bound_dev->cluster_info[i].cluster_id) ++ continue; ++ for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) { ++ if (bound_dev->cluster_info[i].rxqs[j].status == 1) { ++ bound_dev->cluster_info[i].rxqs[j].status = 2; ++ rxq_id = bound_dev->cluster_info[i].rxqs[j].rxq_id; ++ vecls_debug("cluster:%d cpu:%d alloc rxq_id:%d\n", ++ cluster_id, cpu, rxq_id); ++ return rxq_id; ++ } ++ } ++ } ++ vecls_debug("cluster:%d no free rxq for cpu:%d\n", cluster_id, cpu); ++ return -1; ++} ++ ++static int put_cluster_rxq(struct vecls_numa_bound_dev_info *bound_dev, int rxq_id) ++{ ++ int i, j; ++ ++ for (i = 0; i < vecls_cluster_per_numa; i++) { ++ for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) { ++ if (bound_dev->cluster_info[i].rxqs[j].status == 2 && ++ bound_dev->cluster_info[i].rxqs[j].rxq_id == rxq_id) { ++ bound_dev->cluster_info[i].rxqs[j].status = 1; ++ vecls_debug("free rxq_id:%d\n", rxq_id); ++ return 0; ++ } ++ } ++ } ++ vecls_debug("no match malloced rxq_id:%d\n", rxq_id); ++ return -1; ++} ++ ++int alloc_rxq_id(int nid, int cpu, int devid) ++{ ++ struct vecls_numa_bound_dev_info *bound_dev; ++ struct vecls_numa_info *numa_info; ++ int rxq_id; ++ ++ numa_info = get_vecls_numa_info(nid); ++ if (!numa_info) { ++ vecls_error("error nid:%d\n", nid); ++ return -EINVAL; ++ } ++ ++ if (devid >= VECLS_MAX_NETDEV_NUM) { ++ vecls_error("error bound_dev index:%d\n", devid); ++ return -EINVAL; ++ } ++ bound_dev = &numa_info->bound_dev[devid]; ++ ++ if (strategy == 1) { ++ rxq_id = get_cluster_rxq(bound_dev, cpu); ++ if (rxq_id < 0 || rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) ++ vecls_debug("failed to get rxq_id:%d in cluster, try numa\n", rxq_id); ++ else ++ goto found; ++ } ++ ++ rxq_id = find_first_bit(bound_dev->bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV); ++ if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) { ++ vecls_error("error rxq_id:%d\n", rxq_id); ++ return -EINVAL; ++ } ++ ++found: ++ clear_bit(rxq_id, bound_dev->bitmap_rxq); ++ vecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); ++ return rxq_id; ++} ++ ++void free_rxq_id(int nid, int devid, int rxq_id) ++{ ++ struct vecls_numa_bound_dev_info *bound_dev; ++ 
struct vecls_numa_info *numa_info; ++ ++ numa_info = get_vecls_numa_info(nid); ++ if (!numa_info) { ++ vecls_error("error nid:%d\n", nid); ++ return; ++ } ++ ++ if (devid >= VECLS_MAX_NETDEV_NUM) { ++ vecls_error("error bound_dev index:%d\n", devid); ++ return; ++ } ++ bound_dev = &numa_info->bound_dev[devid]; ++ ++ if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) { ++ vecls_error("error rxq_id:%d\n", rxq_id); ++ return; ++ } ++ ++ if (strategy == 1) ++ put_cluster_rxq(bound_dev, rxq_id); ++ ++ if (test_bit(rxq_id, bound_dev->bitmap_rxq)) { ++ vecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id); ++ return; ++ } ++ ++ set_bit(rxq_id, bound_dev->bitmap_rxq); ++ vecls_debug("free nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); ++} ++ ++static int init_vecls_numa_info(void) ++{ ++ struct vecls_numa_info *numa_info; ++ int nid, ret = 0; ++ ++ vecls_numa_num = num_online_nodes(); ++ vecls_numa_info_table = kzalloc(sizeof(struct vecls_numa_info) * vecls_numa_num, GFP_ATOMIC); ++ if (!vecls_numa_info_table) { ++ ret = -ENOMEM; ++ vecls_error("vecls_numa_info_table alloc failed:%d\n", ret); ++ return ret; ++ } ++ ++ vecls_cluster_cpu_num = cpumask_weight(topology_cluster_cpumask(raw_smp_processor_id())); ++ vecls_cluster_per_numa = (nr_cpu_ids / vecls_cluster_cpu_num) / vecls_numa_num; ++ vecls_debug("vecls_numa_num=%d cluster_cpu_num:%d cluster_cpu_num:%d\n", ++ vecls_numa_num, vecls_cluster_per_numa, vecls_cluster_cpu_num); ++ ++ for_each_vecls_numa(nid, numa_info) ++ init_numa_avail_cpus(nid, numa_info); ++ ++ return ret; ++} ++ ++static int alloc_available_cpu(int nid, struct vecls_numa_info *numa_info) ++{ ++ int cpu; ++ ++ cpu = find_first_bit(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); ++ if (cpu >= VECLS_MAX_CPU_NUM) { ++ vecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu); ++ return -1; ++ } ++ ++ clear_bit(cpu, numa_info->avail_cpus); ++ return cpu; ++} ++ ++static void add_netdev_irq_affinity_cpu(struct vecls_netdev_info *vecls_dev, int rxq_id, int cpu) ++{ ++ struct vecls_netdev_queue_info *rxq_info; ++ ++ if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) ++ return; ++ ++ rxq_info = &vecls_dev->rxq[rxq_id]; ++ rxq_info->affinity_cpu = cpu; ++} ++ ++static void config_affinity_strategy_default(struct vecls_netdev_info *vecls_dev) ++{ ++ struct vecls_numa_info *numa_info; ++ int rxq_num = vecls_dev->rxq_num; ++ int rxq_per_numa = rxq_num / vecls_numa_num; ++ int remain = rxq_num - rxq_per_numa * vecls_numa_num; ++ int numa_rxq_id, rxq_id, nid, cpu; ++ ++ vecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", vecls_dev->dev_name, ++ rxq_num, rxq_per_numa, remain); ++ ++ // average config rxq to every numa ++ for_each_vecls_numa(nid, numa_info) { ++ for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) { ++ cpu = alloc_available_cpu(nid, numa_info); ++ if (cpu < 0) ++ break; ++ ++ rxq_id = rxq_per_numa * nid + numa_rxq_id; ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id, cpu); ++ } ++ } ++ ++ if (!remain) ++ return; ++ ++ // config remain rxq to every numa ++ numa_rxq_id = 0; ++ for_each_vecls_numa(nid, numa_info) { ++ if (numa_rxq_id >= remain) ++ break; ++ cpu = alloc_available_cpu(nid, numa_info); ++ if (cpu < 0) ++ break; ++ ++ rxq_id = rxq_per_numa * vecls_numa_num + numa_rxq_id; ++ numa_rxq_id++; ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id, cpu); ++ } ++} ++ ++static void config_affinity_strategy_cluster(struct vecls_netdev_info *vecls_dev) ++{ ++ int rxq_num = vecls_dev->rxq_num; ++ int rxq_per_numa = rxq_num / vecls_numa_num; ++ int remain = 
rxq_num - rxq_per_numa * vecls_numa_num; ++ int cpu_idx = vecls_cluster_cpu_num - 1; ++ int cluster, cpu, rxq_id = 0, round; ++ ++ round = rxq_per_numa < vecls_cluster_per_numa ? rxq_per_numa : vecls_cluster_per_numa; ++ if (remain > 0) ++ round++; ++ vecls_debug("round=%d\n", round); ++ ++ while (rxq_id < vecls_dev->rxq_num) { ++ for (cluster = 0; cluster < vecls_cluster_per_numa * vecls_numa_num; cluster++) { ++ if (cluster % vecls_cluster_per_numa >= round) ++ continue; ++ cpu = cluster * vecls_cluster_cpu_num + cpu_idx; ++ if (rxq_id >= vecls_dev->rxq_num) ++ break; ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); ++ } ++ cpu_idx--; ++ if (--cpu_idx < 0) ++ cpu_idx = vecls_cluster_cpu_num - 1; ++ } ++} ++ ++static void config_affinity_strategy_numa(struct vecls_netdev_info *vecls_dev) ++{ ++ int rxq_num = vecls_dev->rxq_num; ++ int rxq_per_numa = rxq_num / vecls_numa_num; ++ int cpu_per_numa = nr_cpu_ids / vecls_numa_num; ++ int remain = rxq_num - rxq_per_numa * vecls_numa_num; ++ struct vecls_numa_info *numa_info; ++ int numa_start_cpu, numa_cpu_id; ++ int rxq_id = 0, nid, cpu; ++ ++ for_each_vecls_numa(nid, numa_info) { ++ numa_start_cpu = find_first_bit(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); ++ for (numa_cpu_id = 0; numa_cpu_id < rxq_per_numa; numa_cpu_id++) { ++ cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); ++ if (rxq_id >= vecls_dev->rxq_num) ++ break; ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); ++ } ++ if (remain-- > 0) { ++ cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); ++ } ++ } ++} ++ ++static void config_affinity_strategy_custom(struct vecls_netdev_info *vecls_dev) ++{ ++ vecls_debug("dev=%s\n", vecls_dev->dev_name); ++} ++ ++static void config_affinity_strategy(void) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ int devid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ switch (strategy) { ++ case 1: ++ config_affinity_strategy_cluster(vecls_dev); ++ break; ++ case 2: ++ config_affinity_strategy_numa(vecls_dev); ++ break; ++ case 3: ++ config_affinity_strategy_custom(vecls_dev); ++ break; ++ case 0: ++ default: ++ config_affinity_strategy_default(vecls_dev); ++ break; ++ } ++ } ++} ++ ++static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu) ++{ ++ int err = 0; ++ ++ err = irq_set_affinity(irq, get_cpu_mask(cpu)); ++ vecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err); ++} ++ ++static void enable_affinity_strategy(void) ++{ ++ struct vecls_netdev_queue_info *rxq_info; ++ struct vecls_netdev_info *vecls_dev; ++ int rxq_id, devid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { ++ rxq_info = &vecls_dev->rxq[rxq_id]; ++ irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu); ++ } ++ } ++} ++ ++static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id, ++ const struct cpumask *cpu_mask) ++{ ++ int err = 0; ++ ++ err = netif_set_xps_queue(netdev, cpu_mask, rxq_id); ++ vecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id, ++ cpumask_pr_args(cpu_mask), err); ++} ++ ++static void set_netdev_xps_queue(bool enable) ++{ ++ const struct cpumask clear_mask = { 0 }; ++ struct vecls_netdev_info *vecls_dev; ++ const struct cpumask *cpu_mask; ++ int rxq_id, devid, cpu, nid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { ++ cpu = vecls_dev->rxq[rxq_id].affinity_cpu; ++ nid = 
cpu_to_node(cpu); ++ if (enable) ++ cpu_mask = cpumask_of_node(nid); ++ else ++ cpu_mask = &clear_mask; ++ ++ netif_set_xps_queue_wrapper(vecls_dev->netdev, rxq_id, cpu_mask); ++ } ++ } ++} ++ ++static int __maybe_unused venetcls_status_seq_show(struct seq_file *seq, void *v) ++{ ++ int err; ++ ++ if (mode == 0) ++ err = venetcls_ntuple_status(seq, v); ++ else ++ err = venetcls_flow_status(seq, v); ++ return err; ++} ++ ++static __init int vecls_init(void) ++{ ++ struct vecls_numa_info *numa_info; ++ int nid, err; ++ ++ if (!check_params()) ++ return -EINVAL; ++ ++ err = init_vecls_numa_info(); ++ if (err) ++ return err; ++ ++ err = init_vecls_netdev_info(ifname); ++ if (err) ++ goto clean_numa; ++ ++ // Set irq affinity ++ config_affinity_strategy(); ++ enable_affinity_strategy(); ++ ++ // Calculate rxq bounded to one numa ++ for_each_vecls_numa(nid, numa_info) { ++ err = init_numa_rxq_bitmap(nid, numa_info); ++ if (err) ++ goto clean_rxq; ++ } ++ ++#ifdef CONFIG_XPS ++ set_netdev_xps_queue(true); ++#endif ++ ++ if (mode == 0) ++ err = vecls_ntuple_res_init(); ++ else ++ err = vecls_flow_res_init(); ++ ++ if (err) ++ goto clean_rxq; ++ ++#ifdef CONFIG_PROC_FS ++ if (!proc_create_net_single("venet_status", 0444, init_net.proc_net, ++ venetcls_status_seq_show, NULL)) { ++ err = -ENOMEM; ++ goto clean_rxq; ++ } ++#endif ++ ++ return 0; ++ ++clean_rxq: ++clean_numa: ++ clean_vecls_netdev_info(); ++ clean_vecls_numa_info(); ++ return err; ++} ++ ++static __exit void vecls_exit(void) ++{ ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("venet_status", init_net.proc_net); ++#endif ++ if (mode == 0) ++ vecls_ntuple_res_clean(); ++ else ++ vecls_flow_res_clean(); ++ ++#ifdef CONFIG_XPS ++ set_netdev_xps_queue(false); ++#endif ++ ++ clean_vecls_rxq(); ++ clean_vecls_netdev_info(); ++ clean_vecls_numa_info(); ++} ++ ++module_init(vecls_init); ++module_exit(vecls_exit); ++ ++MODULE_DESCRIPTION("venetcls"); ++MODULE_LICENSE("GPL v2"); +diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c +new file mode 100644 +index 000000000..135e2e049 +--- /dev/null ++++ b/net/venetcls/venetcls_ntuple.c +@@ -0,0 +1,643 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "venetcls.h" ++ ++struct vecls_sk_rule_list vecls_sk_rules, vecls_sk_list; ++static struct workqueue_struct *do_cfg_workqueue; ++static atomic_t vecls_worker_count = ATOMIC_INIT(0); ++ ++static void init_vecls_sk_rules(void) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) ++ INIT_HLIST_HEAD(vecls_sk_rules.hash + i); ++ mutex_init(&vecls_sk_rules.mutex); ++} ++ ++static inline struct hlist_head *get_rule_hashlist(u32 dip4, u16 dport) ++{ ++ return vecls_sk_rules.hash + (jhash_2words(dip4, dport, 0) & VECLS_SK_RULE_HASHMASK); ++} ++ ++static inline struct hlist_head *get_sk_hashlist(void *sk) ++{ ++ return vecls_sk_list.hash + (jhash(sk, sizeof(sk), 0) & VECLS_SK_RULE_HASHMASK); ++} ++ ++static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action, ++ int ruleid, int nid) ++{ ++ struct hlist_head *hlist = get_rule_hashlist(dip4, dport); ++ struct hlist_head *sk_hlist = get_sk_hashlist(sk); ++ struct vecls_sk_rule *rule; ++ struct vecls_sk_entry *entry; ++ ++ rule = kzalloc(sizeof(struct vecls_sk_rule), GFP_ATOMIC); ++ entry = kzalloc(sizeof(struct vecls_sk_entry), GFP_ATOMIC); ++ if (!rule || !entry) ++ goto out; ++ ++ rule->sk = sk; ++ rule->dip4 = dip4; ++ rule->dport = 
dport; ++ rule->devid = devid; ++ rule->action = action; ++ rule->ruleid = ruleid; ++ rule->nid = nid; ++ hlist_add_head(&rule->node, hlist); ++ ++ entry->sk = sk; ++ entry->sk_rule_hash = jhash_2words(dip4, dport, 0); ++ hlist_add_head(&entry->node, sk_hlist); ++ return; ++out: ++ vecls_debug("alloc failed rule:%p entry:%p\n", rule, entry); ++ kfree(entry); ++ kfree(rule); ++} ++ ++static struct vecls_sk_entry *get_sk_entry(void *sk) ++{ ++ struct hlist_head *sk_hlist = get_sk_hashlist(sk); ++ struct vecls_sk_entry *entry = NULL; ++ ++ hlist_for_each_entry(entry, sk_hlist, node) { ++ if (entry->sk == sk) ++ break; ++ } ++ return entry; ++} ++ ++static void del_sk_rule(struct vecls_sk_rule *rule) ++{ ++ struct vecls_sk_entry *entry; ++ ++ entry = get_sk_entry(rule->sk); ++ if (!entry) ++ return; ++ hlist_del_init(&entry->node); ++ kfree(entry); ++ ++ vecls_debug("del rule=%p\n", rule); ++ hlist_del_init(&rule->node); ++ kfree(rule); ++} ++ ++static struct vecls_sk_rule *get_sk_rule(int devid, u32 dip4, u16 dport) ++{ ++ struct hlist_head *hlist = get_rule_hashlist(dip4, dport); ++ struct vecls_sk_rule *rule = NULL; ++ ++ hlist_for_each_entry(rule, hlist, node) { ++ if (rule->devid == devid && rule->dip4 == dip4 && rule->dport == dport) ++ break; ++ } ++ return rule; ++} ++ ++static struct vecls_sk_rule *get_rule_from_sk(int devid, void *sk) ++{ ++ struct vecls_sk_rule *rule = NULL; ++ struct vecls_sk_entry *entry; ++ struct hlist_head *hlist; ++ ++ entry = get_sk_entry(sk); ++ if (!entry) ++ return NULL; ++ ++ hlist = vecls_sk_rules.hash + (entry->sk_rule_hash & VECLS_SK_RULE_HASHMASK); ++ hlist_for_each_entry(rule, hlist, node) { ++ if (rule->devid == devid && rule->sk == sk) ++ break; ++ } ++ return rule; ++} ++ ++static inline bool reuseport_check(int devid, u32 dip4, u16 dport) ++{ ++ return !!get_sk_rule(devid, dip4, dport); ++} ++ ++static u32 get_first_ip4_addr(struct net *net) ++{ ++ struct in_device *in_dev; ++ struct net_device *dev; ++ struct in_ifaddr *ifa; ++ u32 dip4 = 0; ++ ++ rtnl_lock(); ++ rcu_read_lock(); ++ for_each_netdev(net, dev) { ++ if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP)) ++ continue; ++ in_dev = __in_dev_get_rcu(dev); ++ if (!in_dev) ++ continue; ++ ++ in_dev_for_each_ifa_rcu(ifa, in_dev) { ++ if (!strcmp(dev->name, ifa->ifa_label)) { ++ dip4 = ifa->ifa_local; ++ vecls_debug("dev: %s, dip4:%pI4\n", dev->name, &dip4); ++ goto out; ++ } ++ } ++ } ++out: ++ rcu_read_unlock(); ++ rtnl_unlock(); ++ return dip4; ++} ++ ++static void get_sk_rule_addr(struct sock *sk, u32 *dip4, u16 *dport) ++{ ++ *dport = htons(sk->sk_num); ++ ++ if (!match_ip_flag) { ++ *dip4 = 0; ++ return; ++ } ++ ++ if (sk->sk_rcv_saddr) ++ *dip4 = sk->sk_rcv_saddr; ++ else ++ *dip4 = get_first_ip4_addr(sock_net(sk)); ++} ++ ++static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc) ++{ ++ struct ethtool_rxnfc nfccmd; ++ int err; ++ ++ nfccmd.cmd = ETHTOOL_SRXCLSRLDEL; ++ nfccmd.fs.location = loc; ++ err = send_ethtool_ioctl(ctx, &nfccmd); ++ if (err < 0) ++ vecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc); ++ return err; ++} ++ ++static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc) ++{ ++ if (loc >= rmgr->size) { ++ vecls_error("rmgr: Location out of range\n"); ++ return -1; ++ } ++ ++ set_bit(loc, rmgr->slot); ++ return 0; ++} ++ ++static int rmgr_find_empty_slot(struct rmgr_ctrl *rmgr, struct ethtool_rx_flow_spec *fsp) ++{ ++ __u32 loc, slot_num; ++ ++ if (rmgr->driver_select) ++ return 0; ++ ++ loc = rmgr->size - 1; ++ slot_num = loc / BITS_PER_LONG; ++ if 
(!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) { ++ loc -= 1 + (loc % BITS_PER_LONG); ++ slot_num--; ++ } ++ ++ while (loc < rmgr->size && !~(rmgr->slot[slot_num])) { ++ loc -= BITS_PER_LONG; ++ slot_num--; ++ } ++ ++ while (loc < rmgr->size && test_bit(loc, rmgr->slot)) ++ loc--; ++ ++ if (loc < rmgr->size) { ++ fsp->location = loc; ++ return rmgr_ins(rmgr, loc); ++ } ++ ++ return -1; ++} ++ ++static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select) ++{ ++ struct ethtool_rxnfc nfccmd; ++ int err; ++ ++ nfccmd.cmd = ETHTOOL_GRXCLSRLCNT; ++ nfccmd.data = 0; ++ err = send_ethtool_ioctl(ctx, &nfccmd); ++ *count = nfccmd.rule_cnt; ++ if (driver_select) ++ *driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL); ++ if (err < 0) ++ vecls_debug("rxclass: Cannot get RX class rule count\n"); ++ ++ return err; ++} ++ ++static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr) ++{ ++ struct ethtool_rxnfc *nfccmd; ++ __u32 *rule_locs; ++ int i, err = 0; ++ ++ memset(rmgr, 0, sizeof(*rmgr)); ++ err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select); ++ if (err < 0) ++ return err; ++ ++ if (rmgr->driver_select) ++ return err; ++ ++ nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC); ++ if (!nfccmd) { ++ vecls_error("rmgr: Cannot allocate memory for RX class rule locations\n"); ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ nfccmd->cmd = ETHTOOL_GRXCLSRLALL; ++ nfccmd->rule_cnt = rmgr->n_rules; ++ err = send_ethtool_ioctl(ctx, nfccmd); ++ if (err < 0) { ++ vecls_debug("rmgr: Cannot get RX class rules\n"); ++ goto out; ++ } ++ ++ rmgr->size = nfccmd->data; ++ if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) { ++ vecls_error("rmgr: Invalid RX class rules table size\n"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), GFP_ATOMIC); ++ if (!rmgr->slot) { ++ vecls_error("rmgr: Cannot allocate memory for RX class rules\n"); ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ rule_locs = nfccmd->rule_locs; ++ for (i = 0; i < rmgr->n_rules; i++) { ++ err = rmgr_ins(rmgr, rule_locs[i]); ++ if (err < 0) ++ break; ++ } ++ ++out: ++ kfree(nfccmd); ++ return err; ++} ++ ++static void rmgr_cleanup(struct rmgr_ctrl *rmgr) ++{ ++ kfree(rmgr->slot); ++ rmgr->slot = NULL; ++ rmgr->size = 0; ++} ++ ++static int rmgr_set_location(struct cmd_context *ctx, ++ struct ethtool_rx_flow_spec *fsp) ++{ ++ struct rmgr_ctrl rmgr; ++ int ret; ++ ++ ret = rmgr_init(ctx, &rmgr); ++ if (ret < 0) ++ goto out; ++ ++ ret = rmgr_find_empty_slot(&rmgr, fsp); ++out: ++ rmgr_cleanup(&rmgr); ++ return ret; ++} ++ ++static int rxclass_rule_ins(struct cmd_context *ctx, ++ struct ethtool_rx_flow_spec *fsp, u32 rss_context) ++{ ++ struct ethtool_rxnfc nfccmd; ++ u32 loc = fsp->location; ++ int ret; ++ ++ if (loc & RX_CLS_LOC_SPECIAL) { ++ ret = rmgr_set_location(ctx, fsp); ++ if (ret < 0) ++ return ret; ++ } ++ ++ nfccmd.cmd = ETHTOOL_SRXCLSRLINS; ++ nfccmd.rss_context = rss_context; ++ nfccmd.fs = *fsp; ++ ret = send_ethtool_ioctl(ctx, &nfccmd); ++ if (ret < 0) { ++ vecls_debug("Can not insert the clasification rule\n"); ++ return ret; ++ } ++ ++ if (loc & RX_CLS_LOC_SPECIAL) ++ vecls_debug("Added rule with ID %d\n", nfccmd.fs.location); ++ ++ return 0; ++} ++ ++static void flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp, ++ struct ethtool_rx_ntuple_flow_spec *ntuple) ++{ ++ int i; ++ ++ memset(ntuple, ~0, sizeof(*ntuple)); ++ ntuple->flow_type = fsp->flow_type; ++ ntuple->action = 
fsp->ring_cookie; ++ memcpy_r(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u)); ++ memcpy_r(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u)); ++ for (i = 0; i < sizeof(ntuple->m_u); i++) ++ ntuple->m_u.hdata[i] ^= 0xFF; ++ ntuple->flow_type &= ~FLOW_EXT; ++} ++ ++static int do_srxntuple(struct cmd_context *ctx, struct ethtool_rx_flow_spec *fsp) ++{ ++ struct ethtool_rx_ntuple ntuplecmd; ++ struct ethtool_value eval; ++ int ret = 0; ++ ++ flow_spec_to_ntuple(fsp, &ntuplecmd.fs); ++ ++ eval.cmd = ETHTOOL_GFLAGS; ++ ret = send_ethtool_ioctl(ctx, &eval); ++ if (ret || !(eval.data & ETH_FLAG_NTUPLE)) ++ return -1; ++ ++ ntuplecmd.cmd = ETHTOOL_SRXNTUPLE; ++ ret = send_ethtool_ioctl(ctx, &ntuplecmd); ++ if (ret) ++ vecls_debug("Cannot add new rule via N-tuple, ret:%d\n", ret); ++ ++ return ret; ++} ++ ++static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del) ++{ ++ struct ethtool_rx_flow_spec *fsp, rx_rule_fs; ++ u32 rss_context = 0; ++ int ret; ++ ++ vecls_debug("is_del:%d netdev:%s, dip4:%pI4, dport:%d, action:%d, ruleid:%u, del_ruleid:%u\n", ++ is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, ctx->ruleid, ++ ctx->del_ruleid); ++ ++ if (is_del) ++ return rxclass_rule_del(ctx, ctx->del_ruleid); ++ ++ ctx->ret_loc = -1; ++ ++ fsp = &rx_rule_fs; ++ memset(fsp, 0, sizeof(*fsp)); ++ fsp->flow_type = TCP_V4_FLOW; ++ fsp->location = RX_CLS_LOC_ANY; ++ fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4; ++ fsp->h_u.tcp_ip4_spec.pdst = ctx->dport; ++ if (ctx->dip4) ++ fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL; ++ fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL; ++ if (ctx->ruleid) ++ fsp->location = ctx->ruleid; ++ fsp->ring_cookie = ctx->action; ++ ++ ret = do_srxntuple(ctx, &rx_rule_fs); ++ if (!ret) ++ return 0; ++ ++ ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context); ++ if (!ret) ++ ctx->ret_loc = rx_rule_fs.location; ++ return ret; ++} ++ ++static void cfg_work(struct work_struct *work) ++{ ++ struct cfg_param *ctx_p = container_of(work, struct cfg_param, work); ++ struct vecls_netdev_info *vecls_dev; ++ struct vecls_sk_rule *rule; ++ int devid, rxq_id, err; ++ ++ mutex_lock(&vecls_sk_rules.mutex); ++ for_each_vecls_netdev(devid, vecls_dev) { ++ strncpy(ctx_p->ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); ++ if (ctx_p->is_del == false) { ++ if (reuseport_check(devid, ctx_p->ctx.dip4, ctx_p->ctx.dport)) { ++ vecls_debug("dip4:%pI4, dport:%d reuse!\n", &(ctx_p->ctx.dip4), ntohs(ctx_p->ctx.dport)); ++ continue; ++ } ++ ++ // Calculate the bound queue ++ rxq_id = alloc_rxq_id(ctx_p->nid, ctx_p->cpu, devid); ++ if (rxq_id < 0) ++ continue; ++ ++ // Config Ntuple rule to dev ++ ctx_p->ctx.action = (u16)rxq_id; ++ err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); ++ if (err) { ++ vecls_debug("Add sk:%p, dev_id:%d, rxq:%d, err:%d\n", ctx_p->sk, devid, rxq_id, err); ++ free_rxq_id(ctx_p->nid, devid, rxq_id); ++ continue; ++ } ++ add_sk_rule(devid, ctx_p->ctx.dip4, ctx_p->ctx.dport, ++ ctx_p->sk, ctx_p->ctx.action, ctx_p->ctx.ret_loc, ctx_p->nid); ++ } else { ++ rule = get_rule_from_sk(devid, ctx_p->sk); ++ if (!rule) { ++ vecls_debug("rule not found! 
sk:%p, devid:%d, dip4:%pI4, dport:%d\n", ++ ctx_p->sk, devid, &ctx_p->ctx.dip4, ntohs(ctx_p->ctx.dport)); ++ continue; ++ } ++ ++ // Config Ntuple rule to dev ++ ctx_p->ctx.del_ruleid = rule->ruleid; ++ err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); ++ // Free the bound queue ++ free_rxq_id(rule->nid, devid, rule->action); ++ // Delete sk rule ++ del_sk_rule(rule); ++ } ++ } ++ mutex_unlock(&vecls_sk_rules.mutex); ++ kfree(ctx_p); ++ atomic_dec(&vecls_worker_count); ++} ++ ++static bool has_sock_rule(struct sock *sk) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ struct vecls_sk_rule *rule; ++ int devid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ rule = get_rule_from_sk(devid, sk); ++ if (rule) ++ return true; ++ } ++ return false; ++} ++ ++static void del_ntuple_rule(struct sock *sk) ++{ ++ struct cfg_param *ctx_p; ++ ++ if (!has_sock_rule(sk)) ++ return; ++ ++ ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); ++ if (!ctx_p) ++ return; ++ get_sk_rule_addr(sk, &(ctx_p->ctx.dip4), &(ctx_p->ctx.dport)); ++ ++ ctx_p->is_del = true; ++ ctx_p->sk = sk; ++ INIT_WORK(&ctx_p->work, cfg_work); ++ queue_work(do_cfg_workqueue, &ctx_p->work); ++ atomic_inc(&vecls_worker_count); ++} ++ ++static void add_ntuple_rule(struct sock *sk) ++{ ++ struct cfg_param *ctx_p; ++ int cpu = raw_smp_processor_id(); ++ int nid = cpu_to_node(cpu); ++ ++ if (check_appname(current->comm)) ++ return; ++ ++ ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); ++ if (!ctx_p) ++ return; ++ get_sk_rule_addr(sk, &(ctx_p->ctx.dip4), &(ctx_p->ctx.dport)); ++ ++ ctx_p->is_del = false; ++ ctx_p->sk = sk; ++ ctx_p->nid = nid; ++ ctx_p->cpu = cpu; ++ INIT_WORK(&ctx_p->work, cfg_work); ++ queue_work(do_cfg_workqueue, &ctx_p->work); ++ atomic_inc(&vecls_worker_count); ++} ++ ++static void ethtool_cfg_rxcls(struct sock *sk, int is_del) ++{ ++ if (sk->sk_state != TCP_LISTEN) ++ return; ++ ++ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) ++ return; ++ ++ vecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, ip:%pI4, port:%d\n", raw_smp_processor_id(), ++ current->comm, sk, is_del, &sk->sk_rcv_saddr, (u16)sk->sk_num); ++ ++ if (is_del) ++ del_ntuple_rule(sk); ++ else ++ add_ntuple_rule(sk); ++} ++ ++static void clean_vecls_sk_rules(void) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ struct cmd_context ctx = { 0 }; ++ struct vecls_sk_rule *rule; ++ struct hlist_head *hlist; ++ struct hlist_node *n; ++ unsigned int i; ++ int err; ++ ++ mutex_lock(&vecls_sk_rules.mutex); ++ for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { ++ hlist = &vecls_sk_rules.hash[i]; ++ ++ hlist_for_each_entry_safe(rule, n, hlist, node) { ++ vecls_dev = get_vecls_netdev_info(rule->devid); ++ if (!vecls_dev) ++ continue; ++ strncpy(ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); ++ ctx.del_ruleid = rule->ruleid; ++ err = cfg_ethtool_rule(&ctx, true); ++ vecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk, ++ rule->devid, rule->action, rule->ruleid, err); ++ ++ hlist_del(&rule->node); ++ vecls_debug("clean rule=%p\n", rule); ++ kfree(rule); ++ } ++ } ++ mutex_unlock(&vecls_sk_rules.mutex); ++} ++ ++int venetcls_ntuple_status(struct seq_file *seq, void *v) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ struct vecls_sk_rule *rule; ++ struct hlist_head *hlist; ++ struct hlist_node *n; ++ unsigned int i; ++ ++ seq_printf(seq, "%-16s %-12s %-8s %-6s %-6s %-6s\n", ++ "Interface", "dstIP", "dstPort", "rxq", "ruleId", "NumaID"); ++ mutex_lock(&vecls_sk_rules.mutex); ++ for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { ++ hlist = &vecls_sk_rules.hash[i]; ++ 
hlist_for_each_entry_safe(rule, n, hlist, node) { ++ vecls_dev = get_vecls_netdev_info(rule->devid); ++ if (!vecls_dev) ++ continue; ++ seq_printf(seq, "%-16s %-12pI4 %-8d %-6d %-6d %-6d\n", ++ vecls_dev->dev_name, &rule->dip4, ntohs(rule->dport), ++ rule->action, rule->ruleid, rule->nid); ++ } ++ } ++ mutex_unlock(&vecls_sk_rules.mutex); ++ ++ return 0; ++} ++ ++static const struct vecls_hook_ops vecls_ntuple_ops = { ++ .vecls_flow_update = NULL, ++ .vecls_set_cpu = NULL, ++ .vecls_timeout = NULL, ++ .vecls_cfg_rxcls = ethtool_cfg_rxcls, ++}; ++ ++int vecls_ntuple_res_init(void) ++{ ++ do_cfg_workqueue = alloc_ordered_workqueue("vecls_cfg", 0); ++ if (!do_cfg_workqueue) { ++ vecls_debug("alloc_ordered_workqueue fails\n"); ++ return -ENOMEM; ++ } ++ ++ init_vecls_sk_rules(); ++ RCU_INIT_POINTER(vecls_ops, &vecls_ntuple_ops); ++ synchronize_rcu(); ++ return 0; ++} ++ ++void vecls_ntuple_res_clean(void) ++{ ++ RCU_INIT_POINTER(vecls_ops, NULL); ++ synchronize_rcu(); ++ ++ while (atomic_read(&vecls_worker_count) != 0) ++ mdelay(1); ++ destroy_workqueue(do_cfg_workqueue); ++ clean_vecls_sk_rules(); ++} +-- +2.20.1 + diff --git a/0006-block-support-to-dispatch-bio-asynchronously.patch b/0006-block-support-to-dispatch-bio-asynchronously.patch new file mode 100644 index 0000000000000000000000000000000000000000..4bb3791c8d066080f32ab562fa7603258ea286f2 --- /dev/null +++ b/0006-block-support-to-dispatch-bio-asynchronously.patch @@ -0,0 +1,513 @@ +From cafa19382531ab95d76e369712b0f7457d383597 Mon Sep 17 00:00:00 2001 +From: Li Nan +Date: Fri, 14 Jun 2024 11:44:06 +0800 +Subject: [PATCH] block: support to dispatch bio asynchronously + +In certain environments, specific CPUs handle a large number of tasks +and become bottlenecks, affecting overall system performance. This +commit introduces a new feature that enables asynchronous I/O dispatch +to designated CPUs, thereby relieving the pressure on the busy CPUs. + +Signed-off-by: Li Nan +Signed-off-by: Zizhi Wo +--- + block/Kconfig | 11 ++ + block/blk-core.c | 242 +++++++++++++++++++++++++++++++++++++- + block/blk-mq-debugfs.c | 1 + + block/blk-sysfs.c | 60 ++++++++++ + block/blk.h | 8 ++ + config.aarch64 | 1 + + config.x86_64 | 1 + + include/linux/blk_types.h | 1 + + include/linux/blkdev.h | 7 ++ + 9 files changed, 331 insertions(+), 1 deletion(-) + +diff --git a/block/Kconfig b/block/Kconfig +index c6ce41a5e..665a09a0a 100644 +--- a/block/Kconfig ++++ b/block/Kconfig +@@ -190,6 +190,17 @@ config BLK_INLINE_ENCRYPTION_FALLBACK + by falling back to the kernel crypto API when inline + encryption hardware is not present. + ++config BLK_BIO_DISPATCH_ASYNC ++ bool "Dispatch bios asynchronously on specific cpus" ++ default n ++ help ++ In certain environments, specific CPUs handle a large number of ++ tasks and become bottlenecks, affecting overall system ++ performance. This commit introduces a new feature that enables ++ asynchronous I/O dispatch to designated CPUs, thereby relieving ++ the pressure on the busy CPUs. ++ If unsure, say N. 
++ + source "block/partitions/Kconfig" + + config BLOCK_COMPAT +diff --git a/block/blk-core.c b/block/blk-core.c +index 46a7049b8..3ce2baf7e 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -74,6 +74,236 @@ struct kmem_cache *blk_requestq_cachep; + */ + static struct workqueue_struct *kblockd_workqueue; + ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++ ++#define BIO_DISPATCH_MAX_LOOP 16 ++ ++struct async_bio { ++ struct bio_list list; ++ spinlock_t lock; ++} ____cacheline_aligned_in_smp; ++ ++struct bio_dispatch_async_ctl { ++ /* ++ * Vector size is nr_cpu_ids, list stores bio dispatched from other cpu, ++ * such bio will be dispatched asynchronously to the cpu this structure ++ * is serviced. ++ */ ++ struct async_bio *bios; ++ /* kthread to handle bio dispatched from other cpu. */ ++ struct task_struct *thread; ++ wait_queue_head_t wait; ++}; ++ ++static struct bio_dispatch_async_ctl __percpu *bio_dispatch_async_ctl; ++ ++static int blk_alloc_queue_dispatch_async(struct request_queue *q) ++{ ++ int cpu; ++ ++ /* use the same function and parameters as alloc_cpumask_var() */ ++ q->dispatch_async_cpus = kmalloc_node(cpumask_size(), ++ GFP_KERNEL, q->node); ++ if (!q->dispatch_async_cpus) ++ return -ENOMEM; ++ ++ q->last_dispatch_cpu = alloc_percpu(int); ++ if (!q->last_dispatch_cpu) { ++ kfree(q->dispatch_async_cpus); ++ q->dispatch_async_cpus = NULL; ++ return -ENOMEM; ++ } ++ ++ cpumask_setall(q->dispatch_async_cpus); ++ for_each_possible_cpu(cpu) ++ *per_cpu_ptr(q->last_dispatch_cpu, cpu) = cpu; ++ ++ return 0; ++} ++ ++void blk_free_queue_dispatch_async(struct request_queue *q) ++{ ++ kfree(q->dispatch_async_cpus); ++ q->dispatch_async_cpus = NULL; ++ free_percpu(q->last_dispatch_cpu); ++ q->last_dispatch_cpu = NULL; ++} ++ ++static int get_dispatch_cpu(struct request_queue *q) ++{ ++ int cpu = cpumask_next(this_cpu_read(*q->last_dispatch_cpu), ++ q->dispatch_async_cpus); ++ ++ if (cpu >= nr_cpu_ids) ++ cpu = cpumask_first(q->dispatch_async_cpus); ++ ++ return cpu; ++} ++ ++static bool __submit_bio_noacct_async(struct bio *bio) ++{ ++ struct request_queue *q = bio->bi_bdev->bd_disk->queue; ++ int current_cpu = smp_processor_id(); ++ int dispatch_cpu = get_dispatch_cpu(q); ++ struct bio_dispatch_async_ctl *ctl; ++ ++ if (dispatch_cpu >= nr_cpu_ids) ++ return false; ++ ++ this_cpu_write(*q->last_dispatch_cpu, dispatch_cpu); ++ ++ ctl = per_cpu_ptr(bio_dispatch_async_ctl, dispatch_cpu); ++ spin_lock_irq(&ctl->bios[current_cpu].lock); ++ bio_list_add(&ctl->bios[current_cpu].list, bio); ++ spin_unlock_irq(&ctl->bios[current_cpu].lock); ++ ++ if (wq_has_sleeper(&ctl->wait)) ++ wake_up(&ctl->wait); ++ ++ return true; ++} ++ ++static bool submit_bio_noacct_async(struct bio *bio) ++{ ++ struct request_queue *q; ++ ++ if (bio_flagged(bio, BIO_ASYNC)) ++ return false; ++ ++ bio_set_flag(bio, BIO_ASYNC); ++ /* ++ * Don't dispatch bio asynchronously in following cases: ++ * ++ * - QUEUE_FLAG_DISPATCH_ASYNC is not set; ++ * - io polling is enabled; ++ * - current cpu is the target cpu; ++ * - bio is flagged no wait; ++ */ ++ q = bio->bi_bdev->bd_disk->queue; ++ if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags) || ++ test_bit(QUEUE_FLAG_POLL, &q->queue_flags) || ++ cpumask_test_cpu(smp_processor_id(), q->dispatch_async_cpus) || ++ bio->bi_opf & REQ_NOWAIT) ++ return false; ++ ++ return __submit_bio_noacct_async(bio); ++} ++ ++static bool collect_bio(struct bio_dispatch_async_ctl *ctl, ++ struct bio_list *list) ++{ ++ bool has_bio = false; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ 
struct async_bio *abio = &ctl->bios[cpu]; ++ ++ if (bio_list_empty(&abio->list)) ++ continue; ++ ++ has_bio = true; ++ ++ spin_lock_irq(&abio->lock); ++ bio_list_merge(list, &abio->list); ++ bio_list_init(&abio->list); ++ spin_unlock_irq(&abio->lock); ++ } ++ ++ return has_bio; ++} ++ ++static int bio_dispatch_work(void *data) ++{ ++ int loop_count = 0; ++ struct bio_list bio_list_on_stack; ++ struct blk_plug plug; ++ struct bio_dispatch_async_ctl *ctl; ++ ++ bio_list_init(&bio_list_on_stack); ++ ctl = this_cpu_ptr(bio_dispatch_async_ctl); ++ ++ for (;; loop_count++) { ++ struct bio *bio; ++ bool has_bio = collect_bio(ctl, &bio_list_on_stack); ++ ++ if (!has_bio) { ++ DEFINE_WAIT(wait); ++ ++ for (;;) { ++ prepare_to_wait(&ctl->wait, &wait, ++ TASK_INTERRUPTIBLE); ++ has_bio = collect_bio(ctl, &bio_list_on_stack); ++ if (has_bio) ++ break; ++ schedule(); ++ loop_count = 0; ++ } ++ finish_wait(&ctl->wait, &wait); ++ } ++ ++ blk_start_plug(&plug); ++ while ((bio = bio_list_pop(&bio_list_on_stack))) ++ submit_bio_noacct(bio); ++ blk_finish_plug(&plug); ++ ++ /* prevent soft lockup. */ ++ if (loop_count >= BIO_DISPATCH_MAX_LOOP) { ++ loop_count = 0; ++ cond_resched(); ++ } ++ } ++ ++ return 0; ++} ++ ++static void init_blk_queue_async_dispatch(void) ++{ ++ int cpu; ++ ++ bio_dispatch_async_ctl = alloc_percpu(struct bio_dispatch_async_ctl); ++ if (!bio_dispatch_async_ctl) ++ panic("Failed to alloc bio_dispatch_async_ctl\n"); ++ ++ for_each_possible_cpu(cpu) { ++ int i; ++ struct bio_dispatch_async_ctl *ctl = ++ per_cpu_ptr(bio_dispatch_async_ctl, cpu); ++ ++ init_waitqueue_head(&ctl->wait); ++ ctl->bios = kmalloc_array(nr_cpu_ids, sizeof(struct async_bio), ++ GFP_KERNEL); ++ if (!ctl->bios) ++ panic("Failed to alloc async bio array\n"); ++ for (i = 0; i < nr_cpu_ids; ++i) { ++ bio_list_init(&ctl->bios[i].list); ++ spin_lock_init(&ctl->bios[i].lock); ++ } ++ ++ ctl->thread = ++ kthread_create_on_cpu(bio_dispatch_work, NULL, cpu, ++ "bio_dispatch_work_%u"); ++ if (IS_ERR_OR_NULL(ctl->thread)) ++ panic("Failed to create bio dispatch thread\n"); ++ ++ wake_up_process(ctl->thread); ++ } ++} ++#else ++static int blk_alloc_queue_dispatch_async(struct request_queue *q) ++{ ++ return 0; ++} ++ ++static bool submit_bio_noacct_async(struct bio *bio) ++{ ++ return false; ++} ++ ++static void init_blk_queue_async_dispatch(void) ++{ ++} ++#endif ++ + /** + * blk_queue_flag_set - atomically set a queue flag + * @flag: flag to be set +@@ -499,9 +729,12 @@ struct request_queue *blk_alloc_queue(int node_id) + + q->last_merge = NULL; + ++ if (blk_alloc_queue_dispatch_async(q)) ++ goto fail_q; ++ + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); + if (q->id < 0) +- goto fail_q; ++ goto fail_dispatch_async; + + ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); + if (ret) +@@ -553,6 +786,8 @@ struct request_queue *blk_alloc_queue(int node_id) + bioset_exit(&q->bio_split); + fail_id: + ida_simple_remove(&blk_queue_ida, q->id); ++fail_dispatch_async: ++ blk_free_queue_dispatch_async(q); + fail_q: + kmem_cache_free(blk_requestq_cachep, q); + return NULL; +@@ -963,6 +1198,9 @@ static void __submit_bio_noacct_mq(struct bio *bio) + */ + void submit_bio_noacct(struct bio *bio) + { ++ if (submit_bio_noacct_async(bio)) ++ return; ++ + /* + * We only want one ->submit_bio to be active at a time, else stack + * usage with stacked devices could be a problem. 
Use current->bio_list +@@ -1688,5 +1926,7 @@ int __init blk_dev_init(void) + + blk_debugfs_root = debugfs_create_dir("block", NULL); + ++ init_blk_queue_async_dispatch(); ++ + return 0; + } +diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c +index 4866d4f81..67957ce49 100644 +--- a/block/blk-mq-debugfs.c ++++ b/block/blk-mq-debugfs.c +@@ -131,6 +131,7 @@ static const char *const blk_queue_flag_name[] = { + QUEUE_FLAG_NAME(RQ_ALLOC_TIME), + QUEUE_FLAG_NAME(HCTX_ACTIVE), + QUEUE_FLAG_NAME(NOWAIT), ++ QUEUE_FLAG_NAME(DISPATCH_ASYNC), + }; + #undef QUEUE_FLAG_NAME + +diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c +index 725530f13..e0ef894b9 100644 +--- a/block/blk-sysfs.c ++++ b/block/blk-sysfs.c +@@ -304,6 +304,9 @@ QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); + QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); + QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); + QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++QUEUE_SYSFS_BIT_FNS(dispatch_async, DISPATCH_ASYNC, 0); ++#endif + #undef QUEUE_SYSFS_BIT_FNS + + static ssize_t queue_zoned_show(struct request_queue *q, char *page) +@@ -625,6 +628,57 @@ QUEUE_RW_ENTRY(queue_iostats, "iostats"); + QUEUE_RW_ENTRY(queue_random, "add_random"); + QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); + ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++ ++static ssize_t queue_dispatch_async_cpus_show(struct request_queue *q, ++ char *page) ++{ ++ return sprintf(page, "%*pb\n", nr_cpu_ids, ++ cpumask_bits(q->dispatch_async_cpus)); ++} ++ ++static ssize_t queue_dispatch_async_cpus_store(struct request_queue *q, ++ const char *page, size_t count) ++{ ++ cpumask_var_t cpumask; ++ ssize_t ret; ++ ++ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = bitmap_parse(page, count, cpumask_bits(cpumask), ++ nr_cpumask_bits); ++ if (ret < 0) ++ goto out; ++ ++ if (cpumask_empty(cpumask) || ++ !cpumask_subset(cpumask, cpu_online_mask)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ blk_mq_freeze_queue(q); ++ blk_mq_quiesce_queue(q); ++ ++ cpumask_copy(q->dispatch_async_cpus, cpumask); ++ ++ blk_mq_unquiesce_queue(q); ++ blk_mq_unfreeze_queue(q); ++ ret = count; ++out: ++ free_cpumask_var(cpumask); ++ return ret; ++} ++ ++static struct queue_sysfs_entry queue_dispatch_async_cpus_entry = { ++ .attr = {.name = "dispatch_async_cpus", .mode = 0644 }, ++ .show = queue_dispatch_async_cpus_show, ++ .store = queue_dispatch_async_cpus_store, ++}; ++ ++QUEUE_RW_ENTRY(queue_dispatch_async, "dispatch_async"); ++#endif ++ + static struct attribute *queue_attrs[] = { + &queue_requests_entry.attr, + &queue_ra_entry.attr, +@@ -666,6 +720,10 @@ static struct attribute *queue_attrs[] = { + &queue_wb_lat_entry.attr, + &queue_poll_delay_entry.attr, + &queue_io_timeout_entry.attr, ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++ &queue_dispatch_async_cpus_entry.attr, ++ &queue_dispatch_async_entry.attr, ++#endif + #ifdef CONFIG_BLK_DEV_THROTTLING_LOW + &blk_throtl_sample_time_entry.attr, + #endif +@@ -773,6 +831,8 @@ static void blk_release_queue(struct kobject *kobj) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); + ++ blk_free_queue_dispatch_async(q); ++ + blk_free_queue_stats(q->stats); + + blk_queue_free_zone_bitmaps(q); +diff --git a/block/blk.h b/block/blk.h +index e80350327..6a7bad9f8 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -454,4 +454,12 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); + + extern const struct address_space_operations def_blk_aops; + ++#ifdef 
CONFIG_BLK_BIO_DISPATCH_ASYNC ++void blk_free_queue_dispatch_async(struct request_queue *q); ++#else ++static inline void blk_free_queue_dispatch_async(struct request_queue *q) ++{ ++} ++#endif ++ + #endif /* BLK_INTERNAL_H */ +diff --git a/config.aarch64 b/config.aarch64 +index 998d7fb5b..234d15966 100644 +--- a/config.aarch64 ++++ b/config.aarch64 +@@ -818,6 +818,7 @@ CONFIG_BLK_DEBUG_FS=y + CONFIG_BLK_DEBUG_FS_ZONED=y + CONFIG_BLK_SED_OPAL=y + # CONFIG_BLK_INLINE_ENCRYPTION is not set ++CONFIG_BLK_BIO_DISPATCH_ASYNC=y + + # + # Partition Types +diff --git a/config.x86_64 b/config.x86_64 +index 8d9500329..e5908b4a5 100644 +--- a/config.x86_64 ++++ b/config.x86_64 +@@ -861,6 +861,7 @@ CONFIG_BLK_DEBUG_FS=y + CONFIG_BLK_DEBUG_FS_ZONED=y + CONFIG_BLK_SED_OPAL=y + # CONFIG_BLK_INLINE_ENCRYPTION is not set ++CONFIG_BLK_BIO_DISPATCH_ASYNC=y + + # + # Partition Types +diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h +index fa78cbf26..ba37b97cb 100644 +--- a/include/linux/blk_types.h ++++ b/include/linux/blk_types.h +@@ -303,6 +303,7 @@ enum { + BIO_REMAPPED, + BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ + BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */ ++ BIO_ASYNC, /* has been dispatched asynchronously */ + BIO_FLAG_LAST + }; + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index dfd4b9361..18dc3f950 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -337,6 +337,12 @@ struct request_queue { + + bool mq_sysfs_init_done; + ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++ /* used when QUEUE_FLAG_DISPATCH_ASYNC is set */ ++ struct cpumask *dispatch_async_cpus; ++ int __percpu *last_dispatch_cpu; ++#endif ++ + #define BLK_MAX_WRITE_HINTS 5 + u64 write_hints[BLK_MAX_WRITE_HINTS]; + }; +@@ -372,6 +378,7 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ + #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ ++#define QUEUE_FLAG_DISPATCH_ASYNC 30 /* support to dispatch bio asynchronously */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_SAME_COMP) | \ +-- +2.20.1 + diff --git a/0007-sched-fair-Prefer-physical-cores-when-migrating-task.patch b/0007-sched-fair-Prefer-physical-cores-when-migrating-task.patch new file mode 100644 index 0000000000000000000000000000000000000000..946313c66f1291268e6f1c70c186a8850803badd --- /dev/null +++ b/0007-sched-fair-Prefer-physical-cores-when-migrating-task.patch @@ -0,0 +1,218 @@ +From ebc4f8fa4841f245d81f83832532e2206af4f8fc Mon Sep 17 00:00:00 2001 +From: Cheng Yu +Date: Mon, 12 Aug 2024 20:40:25 +0800 +Subject: [PATCH] sched/fair: Prefer physical cores when migrating tasks + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IAJEHU +CVE: NA + +-------------------------------- + +When cpu hyperthreading is enabled, one physical core can virtualize +multiple logical cpus. Assume that physical core0 virtualizes two +logical cpus, cpu0 and cpu1. Only when the load of cpu0 exceeds the set +ratio to the capacity of cpu0, the task will be migrated to the cpu1, +otherwise the task will not be migrated and the cpu0 will still be used. 
+ +Signed-off-by: Cheng Yu +Signed-off-by: Liu Jian +--- + arch/arm64/Kconfig | 1 + + config.aarch64 | 1 + + config.aarch64-64k | 1 + + include/linux/sched/sysctl.h | 4 ++++ + init/Kconfig | 18 ++++++++++++++++++ + kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 4 ++++ + kernel/sysctl.c | 12 ++++++++++++ + 8 files changed, 75 insertions(+) + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index eef487d36..31eaa7775 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -90,6 +90,7 @@ config ARM64 + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_SCHED_KEEP_ON_CORE + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +diff --git a/config.aarch64 b/config.aarch64 +index 234d15966..fcff88edb 100644 +--- a/config.aarch64 ++++ b/config.aarch64 +@@ -170,6 +170,7 @@ CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y + CONFIG_NET_NS=y ++CONFIG_SCHED_KEEP_ON_CORE=y + CONFIG_CHECKPOINT_RESTORE=y + CONFIG_SCHED_AUTOGROUP=y + # CONFIG_SYSFS_DEPRECATED is not set +diff --git a/config.aarch64-64k b/config.aarch64-64k +index 5cce0103e..41daa7820 100644 +--- a/config.aarch64-64k ++++ b/config.aarch64-64k +@@ -170,6 +170,7 @@ CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y + CONFIG_NET_NS=y ++CONFIG_SCHED_KEEP_ON_CORE=y + CONFIG_CHECKPOINT_RESTORE=y + CONFIG_SCHED_AUTOGROUP=y + # CONFIG_SYSFS_DEPRECATED is not set +diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h +index 304f43117..da02869a4 100644 +--- a/include/linux/sched/sysctl.h ++++ b/include/linux/sched/sysctl.h +@@ -28,6 +28,10 @@ enum { sysctl_hung_task_timeout_secs = 0 }; + + extern unsigned int sysctl_sched_child_runs_first; + ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++extern int sysctl_sched_util_ratio; ++#endif ++ + enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, +diff --git a/init/Kconfig b/init/Kconfig +index 52167947f..896773c70 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1282,6 +1282,24 @@ config NET_NS + + endif # NAMESPACES + ++# For architectures that want to enable the support for SCHED_KEEP_ON_CORE ++# ++config ARCH_SUPPORTS_SCHED_KEEP_ON_CORE ++ bool ++ ++config SCHED_KEEP_ON_CORE ++ bool "Prefer physical cores when migrating tasks" ++ depends on ARCH_SUPPORTS_SCHED_KEEP_ON_CORE ++ depends on SCHED_SMT ++ default n ++ help ++ When cpu hyperthreading is enabled, one physical core can virtualize ++ multiple logical cpus. Assume that physical core0 virtualizes two ++ logical cpus, cpu0 and cpu1. Only when the load of cpu0 exceeds the ++ ratio to the capacity of cpu0, the task will be migrated to the cpu1, ++ otherwise the task will not be migrated and the cpu0 will still be ++ used. 
++ + config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" + select PROC_CHILDREN +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f4db97423..b9e3a63ad 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6487,6 +6487,22 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t + return si_cpu; + } + ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++int sysctl_sched_util_ratio = 100; ++ ++static bool core_has_spare(int cpu) ++{ ++ int core_id = cpumask_first(cpu_smt_mask(cpu)); ++ unsigned long util = cpu_util(core_id); ++ unsigned long capacity = capacity_of(core_id); ++ ++ if (sysctl_sched_util_ratio == 100) ++ return true; ++ ++ return util * 100 < capacity * sysctl_sched_util_ratio; ++} ++#endif ++ + #else /* CONFIG_SCHED_SMT */ + + static inline void set_idle_cores(int cpu, int val) +@@ -7365,6 +7381,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + /* Fast path */ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + } ++ ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++ if (sched_feat(KEEP_ON_CORE) && ++ static_branch_likely(&sched_smt_present)) { ++ if (core_has_spare(new_cpu)) ++ new_cpu = cpumask_first(cpu_smt_mask((new_cpu))); ++ } ++#endif ++ + rcu_read_unlock(); + + return new_cpu; +@@ -8187,6 +8212,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + + lockdep_assert_rq_held(env->src_rq); + ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++ if (sched_feat(KEEP_ON_CORE) && ++ static_branch_likely(&sched_smt_present)) { ++ if (core_has_spare(env->dst_cpu) && ++ cpumask_first(cpu_smt_mask((env->dst_cpu))) != env->dst_cpu) ++ return 0; ++ } ++#endif ++ + /* + * We do not migrate tasks that are: + * 1) throttled_lb_pair, or +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 8c1d34adc..9d0d29a12 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -58,6 +58,10 @@ SCHED_FEAT(TTWU_QUEUE, true) + SCHED_FEAT(SIS_PROP, false) + SCHED_FEAT(SIS_UTIL, true) + ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++SCHED_FEAT(KEEP_ON_CORE, false) ++#endif ++ + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 32e59f230..5299337f8 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -118,6 +118,7 @@ static int sixty = 60; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; ++static int one_hundred = 100; + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -2746,6 +2747,17 @@ static struct ctl_table kern_table[] = { + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++ { ++ .procname = "sched_util_ratio", ++ .data = &sysctl_sched_util_ratio, ++ .maxlen = sizeof(sysctl_sched_util_ratio), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &one_hundred, ++ }, ++#endif + { } + }; + +-- +2.20.1 +
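
A minimal userspace sketch of how the new block-layer queue attributes from 0006 could be driven, assuming they surface in the usual /sys/block/<dev>/queue/ directory and using a hypothetical device sda. Per the submit_bio_noacct_async() checks in the patch, bios submitted on CPUs outside dispatch_async_cpus are handed to a per-CPU kthread on a CPU inside the mask, so the mask names the CPUs that are allowed to do the actual dispatch; the store handler rejects an empty mask or one that is not a subset of the online CPUs, and the value is parsed with bitmap_parse(), i.e. as a hex CPU bitmap.

	/*
	 * Illustrative sketch only, not part of the patch series.
	 * Assumptions: the attributes appear under /sys/block/sda/queue/,
	 * and CPUs 2 and 3 (hex bitmap 0xc) are online.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	static int write_attr(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			return -1;
		}
		if (fputs(val, f) == EOF) {
			perror(path);
			fclose(f);
			return -1;
		}
		return fclose(f);
	}

	int main(void)
	{
		/* restrict asynchronous dispatch to CPUs 2 and 3 */
		if (write_attr("/sys/block/sda/queue/dispatch_async_cpus", "c\n"))
			return EXIT_FAILURE;
		/* set the per-queue QUEUE_FLAG_DISPATCH_ASYNC bit */
		if (write_attr("/sys/block/sda/queue/dispatch_async", "1\n"))
			return EXIT_FAILURE;
		return EXIT_SUCCESS;
	}

Even with the flag set, polled queues and REQ_NOWAIT bios still take the normal synchronous path, as the early-return conditions in submit_bio_noacct_async() show.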
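
The scheduler change in 0007 gates placement on a utilisation threshold. Below is a standalone restatement of the rule that core_has_spare() applies, with hypothetical numbers; the in-kernel version reads cpu_util() and capacity_of() of the core's first SMT sibling, takes the ratio from the sched_util_ratio sysctl (0-100, default 100, expected at /proc/sys/kernel/sched_util_ratio), and only runs when the KEEP_ON_CORE scheduler feature is enabled.

	/*
	 * Illustrative sketch only: the threshold check used by
	 * core_has_spare(), detached from the scheduler data structures.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	static bool core_has_spare(unsigned long util, unsigned long capacity,
				   int util_ratio)
	{
		/* ratio 100 means the first sibling is always considered spare */
		if (util_ratio == 100)
			return true;
		return util * 100 < capacity * util_ratio;
	}

	int main(void)
	{
		/* hypothetical numbers: capacity 1024, sched_util_ratio = 60 */
		printf("util 500: %s\n",
		       core_has_spare(500, 1024, 60) ? "keep on core" : "use SMT sibling");
		printf("util 700: %s\n",
		       core_has_spare(700, 1024, 60) ? "keep on core" : "use SMT sibling");
		return 0;
	}

With the default ratio of 100 the first logical CPU of a core is always treated as having spare capacity, so lowering the sysctl is what actually lets load spill over to the second logical CPU.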