diff --git a/0001-io_uring-io-wq-stop-setting-PF_NO_SETAFFINITY-on-io-.patch b/0001-io_uring-io-wq-stop-setting-PF_NO_SETAFFINITY-on-io-.patch new file mode 100644 index 0000000000000000000000000000000000000000..29f02709d2d980e7ef7920da685b08952ab8147b --- /dev/null +++ b/0001-io_uring-io-wq-stop-setting-PF_NO_SETAFFINITY-on-io-.patch @@ -0,0 +1,81 @@ +From c226e8f63fd32b161bba9b37ed525b8228b47b08 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Wed, 8 Mar 2023 07:18:51 -0700 +Subject: [PATCH] io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq + workers + +commit 01e68ce08a30db3d842ce7a55f7f6e0474a55f9a + +Every now and then reports come in that are puzzled on why changing +affinity on the io-wq workers fails with EINVAL. This happens because they +set PF_NO_SETAFFINITY as part of their creation, as io-wq organizes +workers into groups based on what CPU they are running on. + +However, this is purely an optimization and not a functional requirement. +We can allow setting affinity, and just lazily update our worker to wqe +mappings. If a given io-wq thread times out, it normally exits if there's +no more work to do. The exception is if it's the last worker available. +For the timeout case, check the affinity of the worker against group mask +and exit even if it's the last worker. New workers should be created with +the right mask and in the right location. + +Reported-by :Daniel Dao +Link: https://lore.kernel.org/io-uring/CA+wXwBQwgxB3_UphSny-yAP5b26meeOu1W4TwYVcD_+5gOhvPw@mail.gmail.com/ +Signed-off-by: Jens Axboe +Signed-off-by: Li Lingfeng +Signed-off-by: Yue Haibing +--- + io_uring/io-wq.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c +index 3970e8047..adc39385e 100644 +--- a/io_uring/io-wq.c ++++ b/io_uring/io-wq.c +@@ -628,7 +628,7 @@ static int io_wqe_worker(void *data) + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; +- bool last_timeout = false; ++ bool exit_mask = false, last_timeout = false; + char buf[TASK_COMM_LEN]; + + worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); +@@ -644,8 +644,11 @@ static int io_wqe_worker(void *data) + io_worker_handle_work(worker); + + raw_spin_lock(&wqe->lock); +- /* timed out, exit unless we're the last worker */ +- if (last_timeout && acct->nr_workers > 1) { ++ /* ++ * Last sleep timed out. Exit if we're not the last worker, ++ * or if someone modified our affinity. 
++ */ ++ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { + acct->nr_workers--; + raw_spin_unlock(&wqe->lock); + __set_current_state(TASK_RUNNING); +@@ -664,7 +667,11 @@ static int io_wqe_worker(void *data) + continue; + break; + } +- last_timeout = !ret; ++ if (!ret) { ++ last_timeout = true; ++ exit_mask = !cpumask_test_cpu(raw_smp_processor_id(), ++ wqe->cpu_mask); ++ } + } + + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) +@@ -716,7 +723,6 @@ static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, + tsk->worker_private = worker; + worker->task = tsk; + set_cpus_allowed_ptr(tsk, wqe->cpu_mask); +- tsk->flags |= PF_NO_SETAFFINITY; + + raw_spin_lock(&wqe->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); +-- +2.20.1 + diff --git a/0002-io_uring-sqpoll-Do-not-set-PF_NO_SETAFFINITY-on-sqpo.patch b/0002-io_uring-sqpoll-Do-not-set-PF_NO_SETAFFINITY-on-sqpo.patch new file mode 100644 index 0000000000000000000000000000000000000000..363c7d523daf75cbf7d1bc85d5447ebd1382f7b3 --- /dev/null +++ b/0002-io_uring-sqpoll-Do-not-set-PF_NO_SETAFFINITY-on-sqpo.patch @@ -0,0 +1,51 @@ +From 00217e9a015b1d2fea9deccd0311be4758cc0d7e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Michal=20Koutn=C3=BD?= +Date: Mon, 26 May 2025 14:38:04 +0800 +Subject: [PATCH] io_uring/sqpoll: Do not set PF_NO_SETAFFINITY on sqpoll + threads + +commit a5fc1441af7719e93dc7a638a960befb694ade89 + +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.3-rc3 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IC6ES1 + +-------------------------------- + +Users may specify a CPU where the sqpoll thread would run. This may +conflict with cpuset operations because of strict PF_NO_SETAFFINITY +requirement. That flag is unnecessary for polling "kernel" threads, see +the reasoning in commit 01e68ce08a30 ("io_uring/io-wq: stop setting +PF_NO_SETAFFINITY on io-wq workers"). Drop the flag on poll threads too. 
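(Illustration only, not part of the patch series.) The user-visible effect of patches 0001 and 0002 is that pinning an io-wq worker or the sqpoll kernel thread from userspace now succeeds instead of failing with EINVAL. A minimal sketch, assuming the thread's TID has already been located (for example under /proc/<pid>/task/ by comm name such as "iou-wrk-*" or "iou-sqp-*"); the TID and target CPU are placeholders:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

/*
 * Pin one io_uring kernel worker thread to the given CPU.  Before these
 * patches the kernel rejected this with EINVAL because the thread carried
 * PF_NO_SETAFFINITY; with the flag dropped the call is expected to succeed.
 */
static int pin_worker(pid_t worker_tid, int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(worker_tid, sizeof(set), &set)) {
		perror("sched_setaffinity");
		return -1;
	}
	return 0;
}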
+ +Fixes: 01e68ce08a30 ("io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers") +Link: https://lore.kernel.org/all/20230314162559.pnyxdllzgw7jozgx@blackpad/ +Signed-off-by: Michal Koutný +Link: https://lore.kernel.org/r/20230314183332.25834-1-mkoutny@suse.com +Signed-off-by: Jens Axboe +Signed-off-by: Li Lingfeng +Signed-off-by: Yue Haibing +--- + io_uring/sqpoll.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c +index a0bb6142a..d152108c5 100644 +--- a/io_uring/sqpoll.c ++++ b/io_uring/sqpoll.c +@@ -239,7 +239,6 @@ static int io_sq_thread(void *data) + set_cpus_allowed_ptr(current, cpu_online_mask); + sqd->sq_cpu = raw_smp_processor_id(); + } +- current->flags |= PF_NO_SETAFFINITY; + + /* + * Force audit context to get setup, in case we do prep side async +-- +2.20.1 + diff --git a/0003-io_uring-Support-forcing-sq-thread-to-be-idle-and-wo.patch b/0003-io_uring-Support-forcing-sq-thread-to-be-idle-and-wo.patch new file mode 100644 index 0000000000000000000000000000000000000000..f33721519a710b66f63393e1bbc21d53f9d7f09f --- /dev/null +++ b/0003-io_uring-Support-forcing-sq-thread-to-be-idle-and-wo.patch @@ -0,0 +1,296 @@ +From a2505e287bd260f975786e72e56f22165ae25fe5 Mon Sep 17 00:00:00 2001 +From: ChenZhen +Date: Fri, 11 Jul 2025 11:22:11 +0800 +Subject: [PATCH] io_uring: Support forcing sq thread to be idle and woken up + by hrtimer + +This patch adds one option IORING_SETUP_SQ_THREAD_IDLE for io_uring user +program to reduce cpu usage of sq thread. +When enabled, sq polling thread will try to be idle. a hrtimer will be +created to wake up the sq thread periodically, the period can be set by +io_uring_params.sq_thread_wakeup_period(unit: us, default 10ms). + +Signed-off-by: ChenZhen +--- + include/linux/io_uring_types.h | 2 ++ + include/uapi/linux/io_uring.h | 18 ++++++++++ + io_uring/io_uring.c | 24 +++++++++++-- + io_uring/sqpoll.c | 63 ++++++++++++++++++++++++++++++++-- + io_uring/sqpoll.h | 5 ++- + 5 files changed, 105 insertions(+), 7 deletions(-) + +diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h +index 37aeea266ebb..93a68a50405d 100644 +--- a/include/linux/io_uring_types.h ++++ b/include/linux/io_uring_types.h +@@ -197,6 +197,7 @@ struct io_ring_ctx { + + struct io_rings *rings; + unsigned int flags; ++ unsigned int ext_flags; + enum task_work_notify_mode notify_method; + unsigned int compat: 1; + unsigned int drain_next: 1; +@@ -350,6 +351,7 @@ struct io_ring_ctx { + + struct list_head defer_list; + unsigned sq_thread_idle; ++ ktime_t sq_thread_wakeup_period; + /* protected by ->completion_lock */ + unsigned evfd_last_cq_tail; + }; +diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h +index 434f62e0fb72..e011a5f16481 100644 +--- a/include/uapi/linux/io_uring.h ++++ b/include/uapi/linux/io_uring.h +@@ -172,6 +172,13 @@ enum { + * try to do it just before it is needed. 
+ */ + #define IORING_SETUP_DEFER_TASKRUN (1U << 13) ++#define IORING_SETUP_EXT_PARAM (1U << 31) /* extended param */ ++ ++/* ++ * io_uring_setup() extended flags ++ */ ++/* Force SQ thread to be idle, waiting for periodic wake-up */ ++#define IORING_SETUP_SQ_THREAD_FORCE_IDLE (1U << 0) + + enum io_uring_op { + IORING_OP_NOP, +@@ -454,6 +461,17 @@ struct io_uring_params { + struct io_cqring_offsets cq_off; + }; + ++struct io_uring_params_ext { ++ __u32 flags; ++ __u32 sq_thread_wakeup_period; ++ __u32 resv[6]; ++}; ++ ++struct io_uring_params_full { ++ struct io_uring_params p; ++ struct io_uring_params_ext ext_p; ++}; ++ + /* + * io_uring_params->features flags + */ +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 353c35987b06..ce7b5f17e4cf 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -3455,6 +3455,7 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) + } + + static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ++ struct io_uring_params_ext *ext_p, + struct io_uring_params __user *params) + { + struct io_ring_ctx *ctx; +@@ -3512,6 +3513,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + !(ctx->flags & IORING_SETUP_SQPOLL)) + ctx->syscall_iopoll = 1; + ++ ctx->ext_flags = ext_p->flags; + ctx->compat = in_compat_syscall(); + if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) + ctx->user = get_uid(current_user()); +@@ -3560,7 +3562,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + if (ret) + goto err; + +- ret = io_sq_offload_create(ctx, p); ++ ret = io_sq_offload_create(ctx, p, ext_p); + if (ret) + goto err; + /* always set a rsrc node */ +@@ -3636,6 +3638,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + static long io_uring_setup(u32 entries, struct io_uring_params __user *params) + { + struct io_uring_params p; ++ struct io_uring_params_ext ext_p = {}; + int i; + + if (copy_from_user(&p, params, sizeof(p))) +@@ -3651,10 +3654,25 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | +- IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) ++ IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | ++ IORING_SETUP_EXT_PARAM)) + return -EINVAL; + +- return io_uring_create(entries, &p, params); ++ if (p.flags & IORING_SETUP_EXT_PARAM) { ++ if (copy_from_user(&ext_p, (void __user *)params + ++ offsetof(struct io_uring_params_full, ext_p), ++ sizeof(ext_p))) ++ return -EFAULT; ++ for (i = 0; i < ARRAY_SIZE(ext_p.resv); i++) { ++ if (ext_p.resv[i]) ++ return -EINVAL; ++ } ++ ++ if (ext_p.flags & ~(IORING_SETUP_SQ_THREAD_FORCE_IDLE)) ++ return -EINVAL; ++ } ++ ++ return io_uring_create(entries, &p, &ext_p, params); + } + + SYSCALL_DEFINE2(io_uring_setup, u32, entries, +diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c +index a0bb6142afbb..049fee9ae8da 100644 +--- a/io_uring/sqpoll.c ++++ b/io_uring/sqpoll.c +@@ -84,6 +84,25 @@ static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) + sqd->sq_thread_idle = sq_thread_idle; + } + ++static __cold void io_sqd_update_wakeup_period(struct io_sq_data *sqd) ++{ ++ struct io_ring_ctx *ctx; ++ ktime_t sq_thread_wakeup_period = 0; ++ ++ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { ++ if (!(ctx->ext_flags & IORING_SETUP_SQ_THREAD_FORCE_IDLE)) ++ continue; ++ ++ if 
(!sq_thread_wakeup_period) { ++ sq_thread_wakeup_period = ctx->sq_thread_wakeup_period; ++ continue; ++ } ++ sq_thread_wakeup_period = min(sq_thread_wakeup_period, ++ ctx->sq_thread_wakeup_period); ++ } ++ WRITE_ONCE(sqd->sq_thread_wakeup_period, sq_thread_wakeup_period); ++} ++ + void io_sq_thread_finish(struct io_ring_ctx *ctx) + { + struct io_sq_data *sqd = ctx->sq_data; +@@ -92,6 +111,9 @@ void io_sq_thread_finish(struct io_ring_ctx *ctx) + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); ++ io_sqd_update_wakeup_period(sqd); ++ if (!sqd->sq_thread_wakeup_period) ++ hrtimer_cancel(&sqd->timer); + io_sq_thread_unpark(sqd); + + io_put_sq_data(sqd); +@@ -156,6 +178,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, + mutex_init(&sqd->lock); + init_waitqueue_head(&sqd->wait); + init_completion(&sqd->exited); ++ hrtimer_init(&sqd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + return sqd; + } + +@@ -251,7 +274,7 @@ static int io_sq_thread(void *data) + + mutex_lock(&sqd->lock); + while (1) { +- bool cap_entries, sqt_spin = false; ++ bool cap_entries, sqt_spin = false, force_idle = false; + + if (io_sqd_events_pending(sqd) || signal_pending(current)) { + if (io_sqd_handle_event(sqd)) +@@ -263,13 +286,18 @@ static int io_sq_thread(void *data) + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + int ret = __io_sq_thread(ctx, cap_entries); + ++ if (ctx->ext_flags & IORING_SETUP_SQ_THREAD_FORCE_IDLE) { ++ force_idle = true; ++ continue; ++ } ++ + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) + sqt_spin = true; + } + if (io_run_task_work()) + sqt_spin = true; + +- if (sqt_spin || !time_after(jiffies, timeout)) { ++ if (!force_idle && (sqt_spin || !time_after(jiffies, timeout))) { + if (sqt_spin) + timeout = jiffies + sqd->sq_thread_idle; + if (unlikely(need_resched())) { +@@ -350,8 +378,21 @@ int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) + return 0; + } + ++static enum hrtimer_restart sq_thread_hrtimer_fn(struct hrtimer *timer) ++{ ++ struct io_sq_data *sqd = container_of(timer, struct io_sq_data, timer); ++ ktime_t sq_thread_wakeup_period; ++ ++ sq_thread_wakeup_period = READ_ONCE(sqd->sq_thread_wakeup_period); ++ wake_up(&sqd->wait); ++ hrtimer_forward_now(timer, sq_thread_wakeup_period); ++ ++ return HRTIMER_RESTART; ++} ++ + __cold int io_sq_offload_create(struct io_ring_ctx *ctx, +- struct io_uring_params *p) ++ struct io_uring_params *p, ++ struct io_uring_params_ext *ext_p) + { + int ret; + +@@ -390,9 +431,19 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + ++ if (ctx->ext_flags & IORING_SETUP_SQ_THREAD_FORCE_IDLE) { ++ /* reset to make this ctx skip updating sq thread idle */ ++ ctx->sq_thread_idle = 0; ++ ctx->sq_thread_wakeup_period = ++ ns_to_ktime((u64)ext_p->sq_thread_wakeup_period * NSEC_PER_USEC); ++ if (!ctx->sq_thread_wakeup_period) ++ ctx->sq_thread_wakeup_period = ns_to_ktime(10 * NSEC_PER_MSEC); ++ } ++ + io_sq_thread_park(sqd); + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); ++ io_sqd_update_wakeup_period(sqd); + /* don't attach to a dying SQPOLL thread, would be racy */ + ret = (attached && !sqd->thread) ? 
-ENXIO : 0; + io_sq_thread_unpark(sqd); +@@ -426,6 +477,12 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, + wake_up_new_task(tsk); + if (ret) + goto err; ++ ++ if (ctx->ext_flags & IORING_SETUP_SQ_THREAD_FORCE_IDLE) { ++ sqd->timer.function = sq_thread_hrtimer_fn; ++ hrtimer_start(&sqd->timer, READ_ONCE(sqd->sq_thread_wakeup_period), ++ HRTIMER_MODE_REL); ++ } + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ + ret = -EINVAL; +diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h +index 36245f1afa5e..f8bbbc336ae6 100644 +--- a/io_uring/sqpoll.h ++++ b/io_uring/sqpoll.h +@@ -18,9 +18,12 @@ struct io_sq_data { + + unsigned long state; + struct completion exited; ++ ktime_t sq_thread_wakeup_period; ++ struct hrtimer timer; + }; + +-int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p); ++int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p, ++ struct io_uring_params_ext *ext_p); + void io_sq_thread_finish(struct io_ring_ctx *ctx); + void io_sq_thread_stop(struct io_sq_data *sqd); + void io_sq_thread_park(struct io_sq_data *sqd); +-- +2.33.0 + diff --git a/0004-sched-support-soft-domain.patch b/0004-sched-support-soft-domain.patch new file mode 100644 index 0000000000000000000000000000000000000000..7302e017a65732c3310d6fd4adc5ac9fe8a24355 --- /dev/null +++ b/0004-sched-support-soft-domain.patch @@ -0,0 +1,1086 @@ +From 168595cbb0852fc92e18e960309da88bddd40a3a Mon Sep 17 00:00:00 2001 +From: Zhang Qiao +Date: Mon, 30 Jun 2025 15:10:48 +0800 +Subject: [PATCH] sched: support soft domain + +On Kunpeng server, each LLC domain contains multiple clusters. When +multiple services are deployed within the same LLC domain, their tasks +become distributed across all clusters. This results in: + +1. High cache synchronization overheadbetween different tasks of the + same service. +2. Severe cache contention among tasks from different services. + +The Soft Domain architecture partitions resources by clusters. Under +low-load conditions, each service operates exclusively within its +dedicated domain to prevent cross-service interference, thereby +enhancing both CPU isolation and improving cache locality. 
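(Illustration only, not part of the patch itself.) The interface added below is per task group on the cgroup v1 cpu controller. Judging from sched_group_set_soft_domain() and find_idlest_llc() further down, writing -1 lets the scheduler pick the idlest LLC, a positive value N selects NUMA node N-1, and 0 detaches the group again; cpu.soft_domain_nr_cpu must be written while the group is still detached, and cpu.soft_domain_cpu_list can then be read back to see which clusters were assigned. A rough usage sketch, where the mount point /sys/fs/cgroup/cpu and the group name "svcA" are assumptions of the example:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int cg_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	/*
	 * Size the soft domain first: the quota write is rejected once the
	 * group is already attached (policy != 0).
	 */
	cg_write("/sys/fs/cgroup/cpu/svcA/cpu.soft_domain_nr_cpu", "8");

	/*
	 * -1: let the kernel pick the idlest LLC and carve clusters out of
	 * it; a positive value would instead select that NUMA node, and 0
	 * would detach the group again.
	 */
	cg_write("/sys/fs/cgroup/cpu/svcA/cpu.soft_domain", "-1");
	return 0;
}

Note that the SOFT_DOMAIN scheduler feature defaults to off in features.h, so the placement logic in fair.c only engages once that feature is turned on (the sched_soft_domain= boot switch, in contrast, defaults to enabled).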
+ +Signed-off-by: Zhang Qiao +--- + config.aarch64 | 2 + + config.aarch64-64k | 2 + + config.x86_64 | 1 + + include/linux/sched/topology.h | 21 ++ + init/Kconfig | 12 + + kernel/sched/Makefile | 1 + + kernel/sched/core.c | 86 ++++++ + kernel/sched/fair.c | 146 +++++++++- + kernel/sched/features.h | 4 + + kernel/sched/sched.h | 41 +++ + kernel/sched/soft_domain.c | 500 +++++++++++++++++++++++++++++++++ + 11 files changed, 815 insertions(+), 1 deletion(-) + create mode 100644 kernel/sched/soft_domain.c + +diff --git a/config.aarch64 b/config.aarch64 +index fcff88edb..51b199bc0 100644 +--- a/config.aarch64 ++++ b/config.aarch64 +@@ -150,6 +150,7 @@ CONFIG_CGROUP_SCHED=y + CONFIG_FAIR_GROUP_SCHED=y + CONFIG_CFS_BANDWIDTH=y + # CONFIG_RT_GROUP_SCHED is not set ++CONFIG_SCHED_SOFT_DOMAIN=y + CONFIG_CGROUP_PIDS=y + CONFIG_CGROUP_RDMA=y + CONFIG_CGROUP_FREEZER=y +@@ -383,6 +384,7 @@ CONFIG_ARM64_PA_BITS=48 + # CONFIG_CPU_BIG_ENDIAN is not set + CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_SCHED_MC=y ++CONFIG_SCHED_CLUSTER=y + CONFIG_SCHED_SMT=y + CONFIG_NR_CPUS=1024 + CONFIG_HOTPLUG_CPU=y +diff --git a/config.aarch64-64k b/config.aarch64-64k +index 41daa7820..cb0999f6c 100644 +--- a/config.aarch64-64k ++++ b/config.aarch64-64k +@@ -150,6 +150,7 @@ CONFIG_CGROUP_SCHED=y + CONFIG_FAIR_GROUP_SCHED=y + CONFIG_CFS_BANDWIDTH=y + # CONFIG_RT_GROUP_SCHED is not set ++CONFIG_SCHED_SOFT_DOMAIN=y + CONFIG_CGROUP_PIDS=y + CONFIG_CGROUP_RDMA=y + CONFIG_CGROUP_FREEZER=y +@@ -385,6 +386,7 @@ CONFIG_ARM64_PA_BITS=48 + # CONFIG_CPU_BIG_ENDIAN is not set + CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_SCHED_MC=y ++CONFIG_SCHED_CLUSTER=y + CONFIG_SCHED_SMT=y + CONFIG_NR_CPUS=1024 + CONFIG_HOTPLUG_CPU=y +diff --git a/config.x86_64 b/config.x86_64 +index e5908b4a5..2b2506c45 100644 +--- a/config.x86_64 ++++ b/config.x86_64 +@@ -172,6 +172,7 @@ CONFIG_CGROUP_SCHED=y + CONFIG_FAIR_GROUP_SCHED=y + CONFIG_CFS_BANDWIDTH=y + # CONFIG_RT_GROUP_SCHED is not set ++# CONFIG_SCHED_SOFT_DOMAIN is not set + CONFIG_CGROUP_PIDS=y + # CONFIG_CGROUP_RDMA is not set + CONFIG_CGROUP_FREEZER=y +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index ce703cae4..01f855335 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -77,6 +77,27 @@ extern int sched_domain_level_max; + + struct sched_group; + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ ++struct soft_subdomain { ++ /* the count of task group attached this sub domain. */ ++ int attached; ++ struct list_head node; ++ unsigned long span[]; ++}; ++ ++/* ++ * Each LLC builds a soft domain: ++ * A soft scheduling domain is divided into multiple subdomains, ++ * typically based on the physical structure of CPU clusters. ++ */ ++struct soft_domain { ++ struct list_head child_domain; ++ int nr_available_cpus; ++ unsigned long span[]; ++}; ++#endif ++ + struct sched_domain_shared { + atomic_t ref; + atomic_t nr_busy_cpus; +diff --git a/init/Kconfig b/init/Kconfig +index dabb28d8a..896773c70 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1033,6 +1033,18 @@ config RT_GROUP_SCHED + + endif #CGROUP_SCHED + ++config SCHED_SOFT_DOMAIN ++ bool "Soft domain scheduler" ++ depends on FAIR_GROUP_SCHED ++ depends on SCHED_CLUSTER ++ default n ++ help ++ This feature builds a CPU soft domain for each task group. Tasks are ++ prioritized and aggregated to execute within soft domains, which optimizes ++ resource allocation and enhances cache locality. ++ ++ If in doubt, say N. 
++ + config UCLAMP_TASK_GROUP + bool "Utilization clamping per group of tasks" + depends on CGROUP_SCHED +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 978fcfca5..df671145b 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o + obj-$(CONFIG_CPU_ISOLATION) += isolation.o + obj-$(CONFIG_PSI) += psi.o + obj-$(CONFIG_SCHED_CORE) += core_sched.o ++obj-$(CONFIG_SCHED_SOFT_DOMAIN) += soft_domain.o +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index ee817573a..c81c03aa9 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -9320,6 +9320,9 @@ void __init sched_init_smp(void) + init_sched_dl_class(); + + sched_smp_initialized = true; ++ ++ build_soft_domain(); ++ + } + + static int __init migration_init(void) +@@ -10009,6 +10012,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) + return 0; + } + ++static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ offline_soft_domain(tg); ++} ++ + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) + { + struct task_group *tg = css_tg(css); +@@ -10680,6 +10690,62 @@ static int cpu_override_proc_write_s64(struct cgroup_subsys_state *css, + #endif + #endif + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ ++static int cpu_soft_domain_write_s64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, ++ s64 val) ++{ ++ return sched_group_set_soft_domain(css_tg(css), val); ++} ++ ++static s64 cpu_soft_domain_read_s64(struct cgroup_subsys_state *css, ++ struct cftype *cftype) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ if (!tg->sf_ctx) ++ return 0; ++ ++ return (s64)tg->sf_ctx->policy; ++} ++ ++static int cpu_soft_domain_quota_write_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, u64 val) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ if (val > cpumask_weight(cpumask_of_node(0))) ++ return -EINVAL; ++ ++ return sched_group_set_soft_domain_quota(tg, val); ++} ++ ++static u64 cpu_soft_domain_quota_read_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ if (!tg->sf_ctx) ++ return 0; ++ ++ return (u64)tg->sf_ctx->nr_cpus; ++} ++ ++static int soft_domain_cpu_list_seq_show(struct seq_file *sf, void *v) ++{ ++ struct task_group *tg = css_tg(seq_css(sf)); ++ ++ if (!tg->sf_ctx) ++ return 0; ++ ++ seq_printf(sf, "%*pbl\n", cpumask_pr_args(to_cpumask(tg->sf_ctx->span))); ++ ++ return 0; ++} ++ ++#endif ++ + static struct cftype cpu_legacy_files[] = { + #ifdef CONFIG_FAIR_GROUP_SCHED + { +@@ -10700,6 +10766,25 @@ static struct cftype cpu_legacy_files[] = { + .write_s64 = cpu_override_proc_write_s64, + }, + #endif ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ { ++ .name = "soft_domain", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_s64 = cpu_soft_domain_read_s64, ++ .write_s64 = cpu_soft_domain_write_s64, ++ }, ++ { ++ .name = "soft_domain_nr_cpu", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_u64 = cpu_soft_domain_quota_read_u64, ++ .write_u64 = cpu_soft_domain_quota_write_u64, ++ }, ++ { ++ .name = "soft_domain_cpu_list", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = soft_domain_cpu_list_seq_show, ++ }, ++#endif + #ifdef CONFIG_CFS_BANDWIDTH + { + .name = "cfs_quota_us", +@@ -10951,6 +11036,7 @@ static struct cftype cpu_files[] = { + struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, ++ .css_offline = cpu_cgroup_css_offline, + .css_released = 
cpu_cgroup_css_released, + .css_free = cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 622c3e99e..b9e3a63ad 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6196,6 +6196,55 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, + static struct sched_group * + find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++static inline bool sched_group_sf_preferred(struct task_struct *p, struct sched_group *group) ++{ ++ struct soft_domain_ctx *ctx = NULL; ++ ++ if (!sched_feat(SOFT_DOMAIN)) ++ return true; ++ ++ ctx = task_group(p)->sf_ctx; ++ if (!ctx || ctx->policy == 0) ++ return true; ++ ++ if (!cpumask_intersects(sched_group_span(group), to_cpumask(ctx->span))) ++ return false; ++ ++ return true; ++} ++ ++static inline bool cpu_is_sf_preferred(struct task_struct *p, int cpu) ++{ ++ struct soft_domain_ctx *ctx = NULL; ++ ++ if (!sched_feat(SOFT_DOMAIN)) ++ return true; ++ ++ ctx = task_group(p)->sf_ctx; ++ if (!ctx || ctx->policy == 0) ++ return true; ++ ++ if (!cpumask_test_cpu(cpu, to_cpumask(ctx->span))) ++ return false; ++ ++ return true; ++} ++#else ++ ++static inline bool sched_group_sf_preferred(struct task_struct *p, struct sched_group *group) ++{ ++ return true; ++} ++ ++static inline bool cpu_is_sf_preferred(struct task_struct *p, int cpu) ++{ ++ return true; ++} ++ ++#endif ++ ++ + /* + * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + */ +@@ -6220,6 +6269,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this + if (!sched_core_cookie_match(rq, p)) + continue; + ++ if (!cpu_is_sf_preferred(p, i)) ++ continue; ++ + if (available_idle_cpu(i)) { + struct cpuidle_state *idle = idle_get_state(rq); + if (idle && idle->exit_latency < min_exit_latency) { +@@ -6534,6 +6586,40 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + } + } + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ if (sched_feat(SOFT_DOMAIN)) { ++ struct task_group *tg = task_group(p); ++ ++ if (tg->sf_ctx && tg->sf_ctx->policy != 0) { ++ struct cpumask *tmpmask = to_cpumask(tg->sf_ctx->span); ++ ++ for_each_cpu_wrap(cpu, tmpmask, target + 1) { ++ if (!cpumask_test_cpu(cpu, cpus)) ++ continue; ++ ++ if (has_idle_core) { ++ i = select_idle_core(p, cpu, cpus, &idle_cpu); ++ if ((unsigned int)i < nr_cpumask_bits) ++ return i; ++ ++ } else { ++ if (--nr <= 0) ++ return -1; ++ i = __select_idle_cpu(cpu, p, &idle_cpu); ++ if ((unsigned int)i < nr_cpumask_bits) ++ return i; ++ } ++ } ++ ++ if (idle_cpu != -1) ++ return idle_cpu; ++ ++ cpumask_andnot(cpus, cpus, tmpmask); ++ } ++ ++ } ++#endif ++ + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + +@@ -7193,6 +7279,36 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + return target; + } + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++static int wake_soft_domain(struct task_struct *p, int target, int *cpu, int sd_flags) ++{ ++ struct cpumask *mask = this_cpu_cpumask_var_ptr(select_idle_mask); ++ struct soft_domain_ctx *ctx = NULL; ++ ++ ctx = task_group(p)->sf_ctx; ++ if (!ctx || ctx->policy == 0) ++ goto out; ++ ++#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE ++ cpumask_and(mask, to_cpumask(ctx->span), p->select_cpus); ++#else ++ cpumask_and(mask, to_cpumask(ctx->span), p->cpus_ptr); ++#endif ++ cpumask_and(mask, mask, cpu_active_mask); ++ if (cpumask_empty(mask) || 
cpumask_test_cpu(target, mask)) ++ goto prefer; ++ else ++ target = cpumask_any_and_distribute(mask, mask); ++ ++prefer: ++ if (sd_flags & SD_BALANCE_FORK) ++ *cpu = target; ++out: ++ ++ return target; ++} ++#endif ++ + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, +@@ -7232,6 +7348,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + } + + rcu_read_lock(); ++ ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ if (sched_feat(SOFT_DOMAIN)) ++ new_cpu = prev_cpu = wake_soft_domain(p, prev_cpu, &cpu, sd_flag); ++#endif ++ + for_each_domain(cpu, tmp) { + /* + * If both 'cpu' and 'prev_cpu' are part of this domain, +@@ -8109,6 +8231,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ /* Do not migrate soft domain tasks to outside of prefer cluster. */ ++ if (sched_feat(SOFT_DOMAIN)) { ++ struct soft_domain_ctx *ctx = task_group(p)->sf_ctx; ++ ++ if (ctx && ctx->policy && ++ !cpumask_test_cpu(env->dst_cpu, to_cpumask(ctx->span))) ++ return 0; ++ } ++#endif ++ + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; +@@ -9409,6 +9542,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) + if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) + continue; + ++ /* Skip over this group if not in soft domain */ ++ if (!sched_group_sf_preferred(p, group)) ++ continue; ++ + local_group = cpumask_test_cpu(this_cpu, + sched_group_span(group)); + +@@ -11859,6 +11996,8 @@ void free_fair_sched_group(struct task_group *tg) + { + int i; + ++ destroy_soft_domain(tg); ++ + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); +@@ -11874,7 +12013,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + { + struct sched_entity *se; + struct cfs_rq *cfs_rq; +- int i; ++ int i, ret; + + tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); + if (!tg->cfs_rq) +@@ -11891,6 +12030,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + ++ ret = init_soft_domain(tg, parent); ++ if (ret) ++ goto err; ++ + for_each_possible_cpu(i) { + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); +@@ -11912,6 +12055,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + err_free_rq: + kfree(cfs_rq); + err: ++ destroy_soft_domain(tg); + return 0; + } + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 1b0979005..9d0d29a12 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -108,3 +108,7 @@ SCHED_FEAT(BASE_SLICE, true) + */ + SCHED_FEAT(PREFER_HIGH_WEIGHT, true) + #endif ++ ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++SCHED_FEAT(SOFT_DOMAIN, false) ++#endif +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 268760576..abd80048c 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -386,6 +386,16 @@ struct cfs_bandwidth { + #endif + }; + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ ++struct soft_domain_ctx { ++ int policy; ++ int nr_cpus; ++ struct soft_domain *sf_d; ++ unsigned long span[]; ++}; ++#endif ++ + /* Task group related information */ + struct task_group { + struct cgroup_subsys_state css; +@@ -441,6 +451,9 @@ struct task_group { + struct uclamp_se 
uclamp[UCLAMP_CNT]; + #endif + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++ struct soft_domain_ctx *sf_ctx; ++#endif + }; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -3190,3 +3203,31 @@ extern int sched_dynamic_mode(const char *str); + extern void sched_dynamic_update(int mode); + #endif + ++#ifdef CONFIG_SCHED_SOFT_DOMAIN ++void build_soft_domain(void); ++int init_soft_domain(struct task_group *tg, struct task_group *parent); ++int destroy_soft_domain(struct task_group *tg); ++void offline_soft_domain(struct task_group *tg); ++int sched_group_set_soft_domain(struct task_group *tg, long val); ++int sched_group_set_soft_domain_quota(struct task_group *tg, long val); ++ ++static inline struct cpumask *soft_domain_span(unsigned long span[]) ++{ ++ return to_cpumask(span); ++} ++#else ++ ++static inline void build_soft_domain(void) { } ++static inline int init_soft_domain(struct task_group *tg, struct task_group *parent) ++{ ++ return 0; ++} ++ ++static inline void offline_soft_domain(struct task_group *tg) { } ++ ++static inline int destroy_soft_domain(struct task_group *tg) ++{ ++ return 0; ++} ++ ++#endif +diff --git a/kernel/sched/soft_domain.c b/kernel/sched/soft_domain.c +new file mode 100644 +index 000000000..0ed239cdb +--- /dev/null ++++ b/kernel/sched/soft_domain.c +@@ -0,0 +1,500 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++/* ++ * Common code for Soft Domain Scheduling ++ * ++ * Copyright (C) 2025-2025 Huawei Technologies Co., Ltd ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. 
++ * ++ */ ++ ++#include "sched.h" ++#include ++ ++static DEFINE_STATIC_KEY_TRUE(__soft_domain_switch); ++ ++static int __init soft_domain_switch_setup(char *str) ++{ ++ int val = 0; ++ ++ if (kstrtoint(str, 0, &val)) ++ pr_warn("sched_soft_domain parameter is error: %s\n", str); ++ else { ++ if (val == 1) ++ static_branch_enable(&__soft_domain_switch); ++ else if (val == 0) ++ static_branch_disable(&__soft_domain_switch); ++ } ++ ++ return 1; ++} ++__setup("sched_soft_domain=", soft_domain_switch_setup); ++ ++static bool soft_domain_enabled(void) ++{ ++ return static_branch_likely(&__soft_domain_switch); ++} ++ ++static DEFINE_PER_CPU(struct soft_domain *, g_sf_d); ++ ++static void free_sub_soft_domain(struct soft_domain *sf_d); ++ ++static int build_soft_sub_domain(int nid, struct cpumask *cpus) ++{ ++ struct cpumask *span = cpumask_of_node(nid); ++ struct soft_domain *sf_d = NULL; ++ int i; ++ ++ sf_d = kzalloc_node(sizeof(struct soft_domain) + cpumask_size(), ++ GFP_KERNEL, nid); ++ if (!sf_d) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&sf_d->child_domain); ++ sf_d->nr_available_cpus = cpumask_weight(span); ++ cpumask_copy(to_cpumask(sf_d->span), span); ++ ++ for_each_cpu_and(i, span, cpus) { ++ struct soft_subdomain *sub_d = NULL; ++ ++ sub_d = kzalloc_node(sizeof(struct soft_subdomain) + cpumask_size(), ++ GFP_KERNEL, nid); ++ if (!sub_d) { ++ free_sub_soft_domain(sf_d); ++ return -ENOMEM; ++ } ++ list_add_tail(&sub_d->node, &sf_d->child_domain); ++ cpumask_and(soft_domain_span(sub_d->span), span, topology_cluster_cpumask(i)); ++ cpumask_andnot(cpus, cpus, topology_cluster_cpumask(i)); ++ } ++ ++ for_each_cpu(i, span) { ++ rcu_assign_pointer(per_cpu(g_sf_d, i), sf_d); ++ } ++ ++ return 0; ++} ++ ++static void free_sub_soft_domain(struct soft_domain *sf_d) ++{ ++ struct list_head *children = &sf_d->child_domain; ++ struct soft_subdomain *entry = NULL, *next = NULL; ++ int i; ++ ++ list_for_each_entry_safe(entry, next, children, node) { ++ list_del(&entry->node); ++ kfree(entry); ++ } ++ ++ for_each_cpu(i, to_cpumask(sf_d->span)) { ++ rcu_assign_pointer(per_cpu(g_sf_d, i), NULL); ++ } ++ ++ kfree(sf_d); ++} ++ ++static void free_soft_domain(void) ++{ ++ struct soft_domain *sf_d = NULL; ++ int i; ++ ++ for_each_cpu(i, cpu_active_mask) { ++ sf_d = rcu_dereference(per_cpu(g_sf_d, i)); ++ if (sf_d) ++ free_sub_soft_domain(sf_d); ++ } ++ ++ static_branch_disable(&__soft_domain_switch); ++} ++ ++void build_soft_domain(void) ++{ ++ static struct cpumask cpus; ++ int i, ret; ++ ++ if (!soft_domain_enabled()) ++ return; ++ ++ cpumask_copy(&cpus, cpu_active_mask); ++ rcu_read_lock(); ++ for (i = 0; i < nr_node_ids; ++ i) { ++ /* build soft domain for each numa domain. 
*/ ++ ret = build_soft_sub_domain(i, &cpus); ++ if (ret) { ++ free_soft_domain(); ++ goto out; ++ } ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static DEFINE_MUTEX(soft_domain_mutex); ++ ++#define NR_MAX_CLUSTER 16 ++ ++struct domain_node { ++ struct soft_subdomain *sud_d; ++ unsigned int attached; ++ int idx; ++ unsigned long util; ++}; ++ ++static int subdomain_cmp(const void *a, const void *b) ++{ ++ struct domain_node *ca = (struct domain_node *)a; ++ struct domain_node *cb = (struct domain_node *)b; ++ ++ if (ca->attached == cb->attached && ca->attached == 0) { ++ if (ca->idx < cb->idx) ++ return -1; ++ else ++ return 1; ++ } ++ ++ if (ca->attached < cb->attached || ++ (ca->attached == cb->attached && ca->util < cb->util)) ++ return -1; ++ ++ return 1; ++} ++ ++struct soft_domain_args { ++ int policy; ++ int nr_cpu; ++ struct cpumask *cpus; ++}; ++ ++static int tg_set_soft_domain(struct task_group *tg, void *data) ++{ ++ struct soft_domain_args *args = (struct soft_domain_args *)data; ++ ++ tg->sf_ctx->policy = args->policy; ++ if (args->policy) { ++ cpumask_copy(to_cpumask(tg->sf_ctx->span), args->cpus); ++ tg->sf_ctx->nr_cpus = args->nr_cpu; ++ } else ++ cpumask_clear(to_cpumask(tg->sf_ctx->span)); ++ ++ return 0; ++} ++ ++static int __calc_cpu(struct task_group *tg) ++{ ++ int nr_cpu = 1; ++ ++ if (tg->sf_ctx->nr_cpus) ++ nr_cpu = tg->sf_ctx->nr_cpus; ++#ifdef CONFIG_CFS_BANDWIDTH ++ else if (tg->cfs_bandwidth.quota != RUNTIME_INF) ++ nr_cpu = DIV_ROUND_UP_ULL(tg->cfs_bandwidth.quota, tg->cfs_bandwidth.period); ++#endif ++ ++ return nr_cpu; ++} ++ ++static unsigned long sum_util(struct cpumask *mask) ++{ ++ unsigned long sum = 0; ++ int cpu; ++ ++ for_each_cpu(cpu, mask) ++ sum += cpu_util_cfs(cpu_rq(cpu)); ++ ++ return sum; ++} ++ ++static int __check_policy(struct task_group *tg, void *data) ++{ ++ return !!tg->sf_ctx->policy; ++} ++ ++static int check_policy(struct task_group *tg, long policy) ++{ ++ int ret; ++ ++ rcu_read_lock(); ++ ret = walk_tg_tree_from(tg, __check_policy, tg_nop, NULL); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static struct soft_domain *find_idlest_llc(long policy, ++ int nr_cpu, cpumask_var_t cpus) ++{ ++ int cpu; ++ int max_cpu = 0; ++ struct soft_domain *idlest = NULL; ++ unsigned long min_util = ULONG_MAX; ++ ++ /* The user has specified the llc. */ ++ if (policy > 0) { ++ for_each_cpu(cpu, cpumask_of_node(policy-1)) { ++ idlest = rcu_dereference(per_cpu(g_sf_d, cpu)); ++ if (idlest != NULL) ++ break; ++ } ++ ++ if (idlest && nr_cpu <= cpumask_weight(to_cpumask(idlest->span))) ++ return idlest; ++ ++ return NULL; ++ } ++ ++ cpumask_copy(cpus, cpu_active_mask); ++ for_each_cpu(cpu, cpus) { ++ struct soft_domain *sf_d = NULL; ++ struct cpumask *mask; ++ ++ sf_d = rcu_dereference(per_cpu(g_sf_d, cpu)); ++ if (sf_d == NULL) ++ continue; ++ ++ mask = to_cpumask(sf_d->span); ++ cpumask_andnot(cpus, cpus, mask); ++ if (nr_cpu > cpumask_weight(mask)) ++ continue; ++ ++ /* ++ * LLC selection order: ++ * 1. When the number of idle cpus meet the requirements, ++ * the one with more idles cpus is better; ++ * 2. Under the condition of insufficient idle cpus, util ++ * is lower, the better. 
++ */ ++ if (sf_d->nr_available_cpus > max_cpu && ++ nr_cpu <= sf_d->nr_available_cpus) { ++ max_cpu = sf_d->nr_available_cpus; ++ idlest = sf_d; ++ } else if (max_cpu == 0) { /* No llc meets the demand */ ++ unsigned long util = sum_util(mask); ++ ++ if (idlest == NULL || util < min_util) { ++ idlest = sf_d; ++ min_util = util; ++ } ++ } ++ } ++ ++ return idlest; ++} ++ ++static int __sched_group_set_soft_domain(struct task_group *tg, long policy) ++{ ++ int cpu; ++ int ret = 0; ++ cpumask_var_t cpus; ++ struct soft_domain_args args; ++ struct soft_domain *sf_d = NULL; ++ struct domain_node nodes[NR_MAX_CLUSTER] = {0}; ++ int nr_cpu = __calc_cpu(tg); ++ ++ if (check_policy(tg, policy)) ++ return -EINVAL; ++ ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ /* 1. Find a idlest llc. */ ++ sf_d = find_idlest_llc(policy, nr_cpu, cpus); ++ if (sf_d != NULL) { ++ /* 2. select idlest clusters. */ ++ struct list_head *children = &sf_d->child_domain; ++ struct soft_subdomain *sub_d = NULL; ++ int nr = 0, i; ++ struct cpumask *tmpmask = NULL; ++ int tmp_cpu = nr_cpu; ++ ++ list_for_each_entry(sub_d, children, node) { ++ nodes[nr].sud_d = sub_d; ++ nodes[nr].attached = sub_d->attached; ++ tmpmask = to_cpumask(sub_d->span); ++ cpu = cpumask_first(tmpmask); ++ nodes[nr].util = sum_util(tmpmask); ++ nodes[nr].idx = nr; ++ nr++; ++ } ++ ++ cpumask_clear(cpus); ++ ++ sort(nodes, nr, sizeof(struct domain_node), subdomain_cmp, NULL); ++ sf_d->nr_available_cpus -= tmp_cpu; ++ for (i = 0; i < nr; i++) { ++ sub_d = nodes[i].sud_d; ++ tmpmask = to_cpumask(sub_d->span); ++ cpumask_or(cpus, cpus, tmpmask); ++ sub_d->attached++; ++ nr_cpu -= cpumask_weight(tmpmask); ++ if (nr_cpu <= 0) ++ break; ++ } ++ ++ /* 3. attach task group to softdomain. */ ++ args.policy = policy; ++ args.cpus = cpus; ++ args.nr_cpu = tmp_cpu; ++ walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args); ++ ++ /* ++ * 4.add tg to llc domain task_groups list for load balance. ++ */ ++ tg->sf_ctx->nr_cpus = tmp_cpu; ++ tg->sf_ctx->sf_d = sf_d; ++ } else { ++ ret = -EINVAL; ++ } ++ rcu_read_unlock(); ++ ++ free_cpumask_var(cpus); ++ ++ return ret; ++} ++ ++static int __sched_group_unset_soft_domain(struct task_group *tg) ++{ ++ struct soft_domain_args args = { ++ .policy = 0, ++ }; ++ struct soft_domain *sf_d = NULL; ++ struct soft_subdomain *sub_d = NULL; ++ struct list_head *children = NULL; ++ ++ /* If parent has set soft domain, child group can't unset itself. */ ++ if (tg->parent->sf_ctx != NULL && tg->parent->sf_ctx->policy != 0) ++ return -EINVAL; ++ ++ sf_d = tg->sf_ctx->sf_d; ++ sf_d->nr_available_cpus += __calc_cpu(tg); ++ children = &sf_d->child_domain; ++ ++ list_for_each_entry(sub_d, children, node) { ++ if (cpumask_intersects(to_cpumask(tg->sf_ctx->span), to_cpumask(sub_d->span))) ++ sub_d->attached--; ++ } ++ ++ walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args); ++ ++ return 0; ++} ++ ++int sched_group_set_soft_domain(struct task_group *tg, long val) ++{ ++ int ret = 0; ++ ++ if (!soft_domain_enabled()) ++ return -EPERM; ++ ++ if (val < -1 || val > nr_node_ids) ++ return -EINVAL; ++ ++ mutex_lock(&soft_domain_mutex); ++ ++ /* If enable or disable is repeated, directly return. 
*/ ++ if (!!tg->sf_ctx->policy == !!val) ++ goto out; ++ ++ if (val == 0) ++ ret = __sched_group_unset_soft_domain(tg); ++ else ++ ret = __sched_group_set_soft_domain(tg, val); ++ ++ if (!ret) ++ tg->sf_ctx->policy = val; ++ ++out: ++ mutex_unlock(&soft_domain_mutex); ++ ++ return ret; ++} ++ ++int sched_group_set_soft_domain_quota(struct task_group *tg, long val) ++{ ++ int ret = 0; ++ ++ if (!soft_domain_enabled()) ++ return -EPERM; ++ ++ mutex_lock(&soft_domain_mutex); ++ if (tg->sf_ctx->policy != 0) { ++ ret = -EINVAL; ++ goto out; ++ } else ++ tg->sf_ctx->nr_cpus = (int)val; ++ ++out: ++ mutex_unlock(&soft_domain_mutex); ++ ++ return ret; ++} ++ ++int init_soft_domain(struct task_group *tg, struct task_group *parent) ++{ ++ struct soft_domain_ctx *sf_ctx = NULL; ++ struct soft_domain_ctx *psf_ctx = NULL; ++ ++ if (!soft_domain_enabled()) ++ return 0; ++ ++ sf_ctx = kzalloc(sizeof(*sf_ctx) + cpumask_size(), GFP_KERNEL); ++ if (!sf_ctx) ++ return -ENOMEM; ++ ++ mutex_lock(&soft_domain_mutex); ++ psf_ctx = parent->sf_ctx; ++ if (psf_ctx) { ++ sf_ctx->policy = psf_ctx->policy; ++ sf_ctx->nr_cpus = psf_ctx->nr_cpus; ++ cpumask_copy(to_cpumask(sf_ctx->span), to_cpumask(psf_ctx->span)); ++ } ++ ++ tg->sf_ctx = sf_ctx; ++ mutex_unlock(&soft_domain_mutex); ++ ++ return 0; ++} ++ ++void offline_soft_domain(struct task_group *tg) ++{ ++ struct soft_domain_ctx *sf_ctx = NULL; ++ struct soft_domain_ctx *psf_ctx = NULL; ++ ++ if (!soft_domain_enabled()) ++ return; ++ ++ sf_ctx = tg->sf_ctx; ++ psf_ctx = tg->parent->sf_ctx; ++ ++ if (!sf_ctx) ++ return; ++ ++ mutex_lock(&soft_domain_mutex); ++ if (sf_ctx->policy != 0) { ++ /* ++ * parent group is not set, this group set ++ * soft domain by user. ++ */ ++ if (psf_ctx == NULL || psf_ctx->policy == 0) ++ __sched_group_unset_soft_domain(tg); ++ } ++ mutex_unlock(&soft_domain_mutex); ++} ++ ++int destroy_soft_domain(struct task_group *tg) ++{ ++ if (!soft_domain_enabled()) ++ return 0; ++ ++ kfree(tg->sf_ctx); ++ ++ return 0; ++} +-- +2.20.1 + diff --git a/0005-net-venetcls-introduce-venetcls-for-network-optimiza.patch b/0005-net-venetcls-introduce-venetcls-for-network-optimiza.patch new file mode 100644 index 0000000000000000000000000000000000000000..80dcecbe1d16ff2604c11fe538363ff74c55fc2f --- /dev/null +++ b/0005-net-venetcls-introduce-venetcls-for-network-optimiza.patch @@ -0,0 +1,3008 @@ +From 8be6d6f669b88c735cfe2a94e12dea20ebb0f87f Mon Sep 17 00:00:00 2001 +From: Yue Haibing +Date: Tue, 5 Aug 2025 16:05:52 +0800 +Subject: [PATCH] net/venetcls: introduce venetcls for network optimization + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/ICBFCS +CVE: NA + +-------------------------------- + +This introduces a kind of network optimization method named venetcls. It +can configure the ntuple rule, and bind interrupt to the netdev queue +automatically. 
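(Illustration only, not part of the patch.) The header added below only publishes a single __rcu ops pointer plus inline wrappers that the core network and TCP paths call; the classifier plugs in by filling a struct vecls_hook_ops and publishing it through vecls_ops, and any hook left NULL is simply skipped by the wrappers. A minimal sketch of that wiring; the callback bodies and module boilerplate here are assumptions for illustration, the real implementation lives in venetcls_main.c (not shown in full here):

#include <linux/module.h>
#include <linux/netdevice.h>
#include <net/sock.h>
#include <linux/venetcls.h>

static void demo_flow_update(struct sock *sk)
{
	/* called from tcp_recvmsg(): record where this socket consumes data */
}

static void demo_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail)
{
	*cpu = -1;	/* a negative cpu tells the hook in dev.c to fall through */
}

static const struct vecls_hook_ops demo_ops = {
	.vecls_flow_update	= demo_flow_update,
	.vecls_set_cpu		= demo_set_cpu,
	/* .vecls_cfg_rxcls and .vecls_timeout left NULL: wrappers check for that */
};

static int __init demo_init(void)
{
	rcu_assign_pointer(vecls_ops, &demo_ops);
	return 0;
}

static void __exit demo_exit(void)
{
	RCU_INIT_POINTER(vecls_ops, NULL);
	synchronize_rcu();	/* make sure no reader still sees demo_ops */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");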
+ +Signed-off-by: Yue Haibing +Signed-off-by: Wang Liang +Signed-off-by: Liu Jian +Signed-off-by: yuelg +--- + include/linux/netdevice.h | 3 + + include/linux/venetcls.h | 97 +++ + kernel/irq/irqdesc.c | 2 +- + net/Kconfig | 1 + + net/Makefile | 1 + + net/core/dev.c | 22 + + net/ipv4/af_inet.c | 6 + + net/ipv4/tcp.c | 9 + + net/venetcls/Kconfig | 7 + + net/venetcls/Makefile | 7 + + net/venetcls/asmdefs.h | 61 ++ + net/venetcls/memcpy-sve.S | 157 +++++ + net/venetcls/venetcls.h | 187 ++++++ + net/venetcls/venetcls_flow.c | 491 +++++++++++++++ + net/venetcls/venetcls_main.c | 1086 ++++++++++++++++++++++++++++++++ + net/venetcls/venetcls_ntuple.c | 643 +++++++++++++++++++ + 16 files changed, 2779 insertions(+), 1 deletion(-) + create mode 100644 include/linux/venetcls.h + create mode 100644 net/venetcls/Kconfig + create mode 100644 net/venetcls/Makefile + create mode 100644 net/venetcls/asmdefs.h + create mode 100644 net/venetcls/memcpy-sve.S + create mode 100644 net/venetcls/venetcls.h + create mode 100644 net/venetcls/venetcls_flow.c + create mode 100644 net/venetcls/venetcls_main.c + create mode 100644 net/venetcls/venetcls_ntuple.c + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index cc1f14f3c..e5f876cec 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -766,6 +766,9 @@ struct netdev_rx_queue { + struct xsk_buff_pool *pool; + #endif + struct file __rcu *dmabuf_pages; ++#if IS_ENABLED(CONFIG_VENETCLS) ++ void __rcu *vecls_ftb; ++#endif + } ____cacheline_aligned_in_smp; + + struct page * +diff --git a/include/linux/venetcls.h b/include/linux/venetcls.h +new file mode 100644 +index 000000000..792991155 +--- /dev/null ++++ b/include/linux/venetcls.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _LINUX_VENETCLS_H ++#define _LINUX_VENETCLS_H ++ ++struct vecls_hook_ops { ++ void (*vecls_cfg_rxcls)(struct sock *sk, int is_del); ++ void (*vecls_flow_update)(struct sock *sk); ++ void (*vecls_set_cpu)(struct sk_buff *skb, int *cpu, int *last_qtail); ++ bool (*vecls_timeout)(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id); ++}; ++ ++typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail); ++extern const struct vecls_hook_ops __rcu *vecls_ops; ++ ++static inline void venetcls_cfg_rxcls(struct sock *sk, int is_del) ++{ ++ const struct vecls_hook_ops *ops; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && ops->vecls_cfg_rxcls) ++ ops->vecls_cfg_rxcls(sk, is_del); ++ rcu_read_unlock(); ++} ++ ++static inline void venetcls_flow_update(struct sock *sk) ++{ ++ const struct vecls_hook_ops *ops; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && ops->vecls_flow_update) ++ ops->vecls_flow_update(sk); ++ rcu_read_unlock(); ++} ++ ++static inline bool ++venetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int* ret) ++{ ++ const struct vecls_hook_ops *ops; ++ int cpu, last_qtail; ++ bool result = false; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && ops->vecls_set_cpu) { ++ ops->vecls_set_cpu(skb, &cpu, &last_qtail); ++ if (cpu >= 0) { ++ *ret = enq_func(skb, cpu, &last_qtail); ++ result = true; ++ } ++ } ++ rcu_read_unlock(); ++ return result; ++} ++ ++static inline void ++venetcls_skblist_set_cpu(struct list_head *head, enqueue_f enq_func) ++{ ++ const struct vecls_hook_ops *ops; ++ struct sk_buff *skb, *next; ++ int cpu, last_qtail; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && 
ops->vecls_set_cpu) { ++ list_for_each_entry_safe(skb, next, head, list) { ++ ops->vecls_set_cpu(skb, &cpu, &last_qtail); ++ if (cpu >= 0) { ++ skb_list_del_init(skb); ++ enq_func(skb, cpu, &last_qtail); ++ } ++ } ++ } ++ rcu_read_unlock(); ++ return; ++} ++ ++static inline bool venetcls_may_expire_flow(struct net_device *dev, ++ u16 rxq_index, u32 flow_id, ++ u16 filter_id, bool *expire) ++{ ++ const struct vecls_hook_ops *ops; ++ bool ret = false; ++ ++ rcu_read_lock(); ++ ops = rcu_dereference(vecls_ops); ++ if (ops && ops->vecls_timeout) { ++ *expire = ops->vecls_timeout(dev, rxq_index, flow_id, filter_id); ++ ret = true; ++ } ++ rcu_read_unlock(); ++ return ret; ++} ++ ++#endif /* _LINUX_VENETCLS_H */ +diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c +index 8202d4a99..eb8641e22 100644 +--- a/kernel/irq/irqdesc.c ++++ b/kernel/irq/irqdesc.c +@@ -366,7 +366,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) + { + return radix_tree_lookup(&irq_desc_tree, irq); + } +-#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE ++#if defined(CONFIG_KVM_BOOK3S_64_HV_MODULE) || IS_ENABLED(CONFIG_VENETCLS) + EXPORT_SYMBOL_GPL(irq_to_desc); + #endif + +diff --git a/net/Kconfig b/net/Kconfig +index dc8451e75..2b68c0f86 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -72,6 +72,7 @@ source "net/xfrm/Kconfig" + source "net/iucv/Kconfig" + source "net/smc/Kconfig" + source "net/xdp/Kconfig" ++source "net/venetcls/Kconfig" + + config INET + bool "TCP/IP networking" +diff --git a/net/Makefile b/net/Makefile +index 6a62e5b27..a2cb1281e 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -78,3 +78,4 @@ obj-$(CONFIG_NET_NCSI) += ncsi/ + obj-$(CONFIG_XDP_SOCKETS) += xdp/ + obj-$(CONFIG_MPTCP) += mptcp/ + obj-$(CONFIG_MCTP) += mctp/ ++obj-$(CONFIG_VENETCLS) += venetcls/ +diff --git a/net/core/dev.c b/net/core/dev.c +index f628494a1..1cd6b5413 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -160,6 +160,12 @@ + /* This should be increased if a protocol with a bigger head is added. */ + #define GRO_MAX_HEAD (MAX_HEADER + 128) + ++#if IS_ENABLED(CONFIG_VENETCLS) ++#include ++const struct vecls_hook_ops __rcu *vecls_ops __read_mostly; ++EXPORT_SYMBOL_GPL(vecls_ops); ++#endif ++ + static DEFINE_SPINLOCK(ptype_lock); + static DEFINE_SPINLOCK(offload_lock); + struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +@@ -4770,6 +4776,10 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + bool expire = true; + unsigned int cpu; + ++#if IS_ENABLED(CONFIG_VENETCLS) ++ if (venetcls_may_expire_flow(dev, rxq_index, flow_id, filter_id, &expire)) ++ return expire; ++#endif + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { +@@ -5881,6 +5891,12 @@ static int netif_receive_skb_internal(struct sk_buff *skb) + return ret; + } + } ++#endif ++#if IS_ENABLED(CONFIG_VENETCLS) ++ if (venetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { ++ rcu_read_unlock(); ++ return ret; ++ } + #endif + ret = __netif_receive_skb(skb); + rcu_read_unlock(); +@@ -5915,6 +5931,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) + } + } + } ++#endif ++#if IS_ENABLED(CONFIG_VENETCLS) ++ venetcls_skblist_set_cpu(head, enqueue_to_backlog); + #endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +@@ -10271,6 +10290,9 @@ int __netdev_update_features(struct net_device *dev) + + return err < 0 ? 
0 : 1; + } ++#if IS_ENABLED(CONFIG_VENETCLS) ++EXPORT_SYMBOL(__netdev_update_features); ++#endif + + static int netdev_do_alloc_pcpu_stats(struct net_device *dev) + { +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 5dc1955e3..06b917182 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -120,6 +120,9 @@ + #include + + #include ++#if IS_ENABLED(CONFIG_VENETCLS) ++#include ++#endif + + /* The inetsw table contains everything that inet_create needs to + * build a new socket. +@@ -229,6 +232,9 @@ int inet_listen(struct socket *sock, int backlog) + if (err) + goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); ++#if IS_ENABLED(CONFIG_VENETCLS) ++ venetcls_cfg_rxcls(sk, 0); ++#endif + } + err = 0; + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index e8b7f0c5d..cc84873ce 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -281,6 +281,9 @@ + #include + #include + #include ++#if IS_ENABLED(CONFIG_VENETCLS) ++#include ++#endif + + /* Track pending CMSGs. */ + enum { +@@ -2940,6 +2943,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + ++#if IS_ENABLED(CONFIG_VENETCLS) ++ venetcls_flow_update(sk); ++#endif + if (sk_can_busy_loop(sk) && + skb_queue_empty_lockless(&sk->sk_receive_queue) && + sk->sk_state == TCP_ESTABLISHED) +@@ -3300,6 +3306,9 @@ void __tcp_close(struct sock *sk, long timeout) + void tcp_close(struct sock *sk, long timeout) + { + lock_sock(sk); ++#if IS_ENABLED(CONFIG_VENETCLS) ++ venetcls_cfg_rxcls(sk, 1); ++#endif + __tcp_close(sk, timeout); + release_sock(sk); + sock_put(sk); +diff --git a/net/venetcls/Kconfig b/net/venetcls/Kconfig +new file mode 100644 +index 000000000..cd4d7c8f9 +--- /dev/null ++++ b/net/venetcls/Kconfig +@@ -0,0 +1,7 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++config VENETCLS ++ tristate "Network classification" ++ default n ++ help ++ Allow to bind NIC interrupts and configure ntuple rules to ++ achieve sock numa affinity +diff --git a/net/venetcls/Makefile b/net/venetcls/Makefile +new file mode 100644 +index 000000000..639a81d7d +--- /dev/null ++++ b/net/venetcls/Makefile +@@ -0,0 +1,7 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++ ++obj-$(CONFIG_VENETCLS) = venetcls.o ++venetcls-y := venetcls_main.o venetcls_ntuple.o venetcls_flow.o ++ifeq ($(CONFIG_ARM64_SVE),y) ++venetcls-y += memcpy-sve.o ++endif +diff --git a/net/venetcls/asmdefs.h b/net/venetcls/asmdefs.h +new file mode 100644 +index 000000000..8138a94c1 +--- /dev/null ++++ b/net/venetcls/asmdefs.h +@@ -0,0 +1,61 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _ASMDEFS_H ++#define _ASMDEFS_H ++ ++/* Branch Target Identitication support. */ ++#define BTI_C hint 34 ++#define BTI_J hint 36 ++/* Return address signing support (pac-ret). */ ++#define PACIASP hint 25; .cfi_window_save ++#define AUTIASP hint 29; .cfi_window_save ++ ++/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ ++#define FEATURE_1_AND 0xc0000000 ++#define FEATURE_1_BTI 1 ++#define FEATURE_1_PAC 2 ++ ++/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ ++#define GNU_PROPERTY(type, value) \ ++ .section .note.gnu.property, "a"; \ ++ .p2align 3; \ ++ .word 4; \ ++ .word 16; \ ++ .word 5; \ ++ .asciz "GNU"; \ ++ .word type; \ ++ .word 4; \ ++ .word value; \ ++ .word 0; \ ++ .text ++ ++#ifndef WANT_GNU_PROPERTY ++#define WANT_GNU_PROPERTY 1 ++#endif ++ ++#if WANT_GNU_PROPERTY ++/* Add property note with supported features to all asm files. 
*/ ++GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) ++#endif ++ ++#define ENTRY_ALIGN(name, alignment) \ ++ .global name; \ ++ .type name, %function; \ ++ .align alignment; \ ++name: \ ++ .cfi_startproc; \ ++ BTI_C; ++ ++#define ENTRY(name) ENTRY_ALIGN(name, 6) ++ ++#define ENTRY_ALIAS(name) \ ++ .global name; \ ++ .type name, %function; \ ++ name: ++ ++#define END(name) \ ++ .cfi_endproc; \ ++ .size name, .-name; ++ ++#define L(l) .L ## l ++ ++#endif +diff --git a/net/venetcls/memcpy-sve.S b/net/venetcls/memcpy-sve.S +new file mode 100644 +index 000000000..106e4c302 +--- /dev/null ++++ b/net/venetcls/memcpy-sve.S +@@ -0,0 +1,157 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#include "asmdefs.h" ++ ++.arch armv8-a+sve ++ ++#define dstin x0 ++#define src x1 ++#define count x2 ++#define dst x3 ++#define srcend x4 ++#define dstend x5 ++#define tmp1 x6 ++#define vlen x6 ++ ++#define A_q q0 ++#define B_q q1 ++#define C_q q2 ++#define D_q q3 ++#define E_q q4 ++#define F_q q5 ++#define G_q q6 ++#define H_q q7 ++ ++/* This implementation handles overlaps and supports both memcpy and memmove ++ from a single entry point. It uses unaligned accesses and branchless ++ sequences to keep the code small, simple and improve performance. ++ SVE vectors are used to speedup small copies. ++ ++ Copies are split into 3 main cases: small copies of up to 32 bytes, medium ++ copies of up to 128 bytes, and large copies. The overhead of the overlap ++ check is negligible since it is only required for large copies. ++ ++ Large copies use a software pipelined loop processing 64 bytes per iteration. ++ The source pointer is 16-byte aligned to minimize unaligned accesses. ++ The loop tail is handled by always copying 64 bytes from the end. ++*/ ++ ++ENTRY_ALIAS (__memmove_aarch64_sve) ++ENTRY (__memcpy_aarch64_sve) ++ cmp count, 128 ++ b.hi L(copy_long) ++ cntb vlen ++ cmp count, vlen, lsl 1 ++ b.hi L(copy32_128) ++ ++ whilelo p0.b, xzr, count ++ whilelo p1.b, vlen, count ++ ld1b z0.b, p0/z, [src, 0, mul vl] ++ ld1b z1.b, p1/z, [src, 1, mul vl] ++ st1b z0.b, p0, [dstin, 0, mul vl] ++ st1b z1.b, p1, [dstin, 1, mul vl] ++ ret ++ ++ /* Medium copies: 33..128 bytes. */ ++L(copy32_128): ++ add srcend, src, count ++ add dstend, dstin, count ++ ldp A_q, B_q, [src] ++ ldp C_q, D_q, [srcend, -32] ++ cmp count, 64 ++ b.hi L(copy128) ++ stp A_q, B_q, [dstin] ++ stp C_q, D_q, [dstend, -32] ++ ret ++ ++ /* Copy 65..128 bytes. */ ++L(copy128): ++ ldp E_q, F_q, [src, 32] ++ cmp count, 96 ++ b.ls L(copy96) ++ ldp G_q, H_q, [srcend, -64] ++ stp G_q, H_q, [dstend, -64] ++L(copy96): ++ stp A_q, B_q, [dstin] ++ stp E_q, F_q, [dstin, 32] ++ stp C_q, D_q, [dstend, -32] ++ ret ++ ++ /* Copy more than 128 bytes. */ ++L(copy_long): ++ add srcend, src, count ++ add dstend, dstin, count ++ ++ /* Use backwards copy if there is an overlap. */ ++ sub tmp1, dstin, src ++ cmp tmp1, count ++ b.lo L(copy_long_backwards) ++ ++ /* Copy 16 bytes and then align src to 16-byte alignment. */ ++ ldr D_q, [src] ++ and tmp1, src, 15 ++ bic src, src, 15 ++ sub dst, dstin, tmp1 ++ add count, count, tmp1 /* Count is now 16 too large. */ ++ ldp A_q, B_q, [src, 16] ++ str D_q, [dstin] ++ ldp C_q, D_q, [src, 48] ++ subs count, count, 128 + 16 /* Test and readjust count. 
*/ ++ b.ls L(copy64_from_end) ++L(loop64): ++ stp A_q, B_q, [dst, 16] ++ ldp A_q, B_q, [src, 80] ++ stp C_q, D_q, [dst, 48] ++ ldp C_q, D_q, [src, 112] ++ add src, src, 64 ++ add dst, dst, 64 ++ subs count, count, 64 ++ b.hi L(loop64) ++ ++ /* Write the last iteration and copy 64 bytes from the end. */ ++L(copy64_from_end): ++ ldp E_q, F_q, [srcend, -64] ++ stp A_q, B_q, [dst, 16] ++ ldp A_q, B_q, [srcend, -32] ++ stp C_q, D_q, [dst, 48] ++ stp E_q, F_q, [dstend, -64] ++ stp A_q, B_q, [dstend, -32] ++ ret ++ ++ /* Large backwards copy for overlapping copies. ++ Copy 16 bytes and then align srcend to 16-byte alignment. */ ++L(copy_long_backwards): ++ cbz tmp1, L(return) ++ ldr D_q, [srcend, -16] ++ and tmp1, srcend, 15 ++ bic srcend, srcend, 15 ++ sub count, count, tmp1 ++ ldp A_q, B_q, [srcend, -32] ++ str D_q, [dstend, -16] ++ ldp C_q, D_q, [srcend, -64] ++ sub dstend, dstend, tmp1 ++ subs count, count, 128 ++ b.ls L(copy64_from_start) ++ ++L(loop64_backwards): ++ str B_q, [dstend, -16] ++ str A_q, [dstend, -32] ++ ldp A_q, B_q, [srcend, -96] ++ str D_q, [dstend, -48] ++ str C_q, [dstend, -64]! ++ ldp C_q, D_q, [srcend, -128] ++ sub srcend, srcend, 64 ++ subs count, count, 64 ++ b.hi L(loop64_backwards) ++ ++ /* Write the last iteration and copy 64 bytes from the start. */ ++L(copy64_from_start): ++ ldp E_q, F_q, [src, 32] ++ stp A_q, B_q, [dstend, -32] ++ ldp A_q, B_q, [src] ++ stp C_q, D_q, [dstend, -64] ++ stp E_q, F_q, [dstin, 32] ++ stp A_q, B_q, [dstin] ++L(return): ++ ret ++ ++END (__memcpy_aarch64_sve) +diff --git a/net/venetcls/venetcls.h b/net/venetcls/venetcls.h +new file mode 100644 +index 000000000..9e8fb0e0a +--- /dev/null ++++ b/net/venetcls/venetcls.h +@@ -0,0 +1,187 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _NET_VENETCLS_H ++#define _NET_VENETCLS_H ++#include ++#include ++#include ++ ++#define VECLS_MAX_NETDEV_NUM 8 ++#define VECLS_MAX_RXQ_NUM_PER_DEV 256 ++#define VECLS_MAX_CPU_NUM 1024 ++ ++#define VECLS_TIMEOUT (5 * HZ) ++#define VECLS_NO_FILTER 0xffff ++#define VECLS_NO_CPU 0xffff ++ ++struct vecls_netdev_queue_info { ++ int irq; ++ int affinity_cpu; ++}; ++ ++struct vecls_netdev_info { ++ char dev_name[IFNAMSIZ]; ++ struct net_device *netdev; ++ int rxq_num; ++ struct vecls_netdev_queue_info rxq[VECLS_MAX_RXQ_NUM_PER_DEV]; ++ int old_filter_state; ++}; ++ ++struct vecls_rxq { ++ int rxq_id; ++ int status; ++}; ++ ++struct vecls_numa_clusterinfo { ++ int cluster_id; ++ int cur_freeidx; ++ struct vecls_rxq rxqs[VECLS_MAX_RXQ_NUM_PER_DEV]; ++}; ++ ++struct vecls_numa_bound_dev_info { ++ DECLARE_BITMAP(bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV); ++ struct vecls_numa_clusterinfo *cluster_info; ++}; ++ ++struct vecls_numa_info { ++ DECLARE_BITMAP(avail_cpus, VECLS_MAX_CPU_NUM); ++ struct vecls_numa_bound_dev_info bound_dev[VECLS_MAX_NETDEV_NUM]; ++}; ++ ++struct cmd_context { ++ char netdev[IFNAMSIZ]; ++ u32 dip4; ++ u16 dport; ++ u16 action; ++ u32 ruleid; ++ u32 del_ruleid; ++ int ret_loc; ++}; ++ ++#define VECLS_SK_RULE_HASHSIZE 256 ++#define VECLS_SK_RULE_HASHMASK (VECLS_SK_RULE_HASHSIZE - 1) ++ ++struct vecls_sk_rule_list { ++ struct hlist_head hash[VECLS_SK_RULE_HASHSIZE]; ++ /* Mutex to synchronize access to ntuple rule locking */ ++ struct mutex mutex; ++}; ++ ++struct vecls_sk_rule { ++ struct hlist_node node; ++ int devid; ++ void *sk; ++ int dip4; ++ int dport; ++ int action; ++ int ruleid; ++ int nid; ++}; ++ ++struct vecls_sk_entry { ++ struct hlist_node node; ++ void *sk; ++ u32 sk_rule_hash; ++}; ++ ++struct vecls_dev_flow { ++ unsigned short 
cpu; ++ unsigned short filter; ++ unsigned long timeout; ++ int isvalid; ++}; ++ ++struct vecls_dev_flow_table { ++ unsigned int mask; ++ struct rcu_head rcu; ++ struct vecls_dev_flow flows[]; ++}; ++ ++struct vecls_sock_flow_table { ++ u32 mask; ++ u32 ents[] ____cacheline_aligned_in_smp; ++}; ++ ++#define VECLS_DEV_FLOW_TABLE_NUM 0x1000 ++#define VECLS_SOCK_FLOW_TABLE_NUM 0x100000 ++#define VECLS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct vecls_dev_flow_table) + \ ++ ((_num) * sizeof(struct vecls_dev_flow))) ++#define VECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct vecls_sock_flow_table, ents[_num])) ++ ++#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ++ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) ++#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ ++ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ ++ NETIF_F_RXHASH) ++ ++struct rmgr_ctrl { ++ int driver_select; ++ unsigned long *slot; ++ __u32 n_rules; ++ __u32 size; ++}; ++ ++struct cfg_param { ++ struct work_struct work; ++ struct cmd_context ctx; ++ struct sock *sk; ++ bool is_del; ++ int nid; ++ int cpu; ++}; ++ ++extern int match_ip_flag; ++extern int debug; ++extern int vecls_netdev_num; ++extern int vecls_numa_num; ++ ++#define vecls_debug(fmt, ...) \ ++ do { \ ++ if (debug) \ ++ trace_printk(fmt, ## __VA_ARGS__); \ ++ } while (0) ++ ++#define vecls_error(fmt, ...) \ ++ do { \ ++ pr_err_ratelimited("venetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); \ ++ trace_printk(fmt, ## __VA_ARGS__); \ ++ } while (0) ++ ++struct vecls_netdev_info *get_vecls_netdev_info(unsigned int index); ++ ++#define for_each_vecls_netdev(devid, vecls_dev) \ ++ for (devid = 0, vecls_dev = get_vecls_netdev_info(devid); \ ++ (devid < vecls_netdev_num) && vecls_dev; \ ++ devid++, vecls_dev = get_vecls_netdev_info(devid)) ++ ++struct vecls_numa_info *get_vecls_numa_info(unsigned int nid); ++ ++#define for_each_vecls_numa(nid, numa_info) \ ++ for (nid = 0, numa_info = get_vecls_numa_info(nid); \ ++ (nid < vecls_numa_num) && numa_info; \ ++ nid++, numa_info = get_vecls_numa_info(nid)) ++ ++#ifdef CONFIG_ARM64_SVE ++void *__memcpy_aarch64_sve(void *, const void *, size_t); ++#define memcpy_r(dst, src, len) \ ++ do { \ ++ if (system_supports_sve()) \ ++ __memcpy_aarch64_sve(dst, src, len); \ ++ else \ ++ memcpy(dst, src, len); \ ++ } while (0) ++#else ++#define memcpy_r(dst, src, len) memcpy(dst, src, len) ++#endif ++ ++int check_appname(char *task_name); ++int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd); ++int alloc_rxq_id(int nid, int cpu, int devid); ++void free_rxq_id(int nid, int devid, int rxq_id); ++int vecls_ntuple_res_init(void); ++void vecls_ntuple_res_clean(void); ++int venetcls_ntuple_status(struct seq_file *seq, void *v); ++int vecls_flow_res_init(void); ++void vecls_flow_res_clean(void); ++int venetcls_flow_status(struct seq_file *seq, void *v); ++ ++#endif /* _NET_VENETCLS_H */ +diff --git a/net/venetcls/venetcls_flow.c b/net/venetcls/venetcls_flow.c +new file mode 100644 +index 000000000..f2d7e42ce +--- /dev/null ++++ b/net/venetcls/venetcls_flow.c +@@ -0,0 +1,491 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "venetcls.h" ++ ++static u32 vecls_cpu_mask; ++static struct vecls_sock_flow_table __rcu *vecls_sock_flow_table; ++static DEFINE_MUTEX(vecls_sock_flow_mutex); ++static DEFINE_SPINLOCK(vecls_dev_flow_lock); ++ ++bool is_vecls_config_netdev(const char *name) ++{ ++ struct vecls_netdev_info 
*netdev_info; ++ int netdev_loop; ++ ++ for_each_vecls_netdev(netdev_loop, netdev_info) ++ if (strcmp(netdev_info->dev_name, name) == 0) ++ return true; ++ ++ return false; ++} ++ ++static bool _vecls_timeout(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id) ++{ ++ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; ++ struct vecls_dev_flow_table *flow_table; ++ struct vecls_dev_flow *rflow; ++ bool expire = true; ++ unsigned int cpu; ++ ++ rcu_read_lock(); ++ flow_table = rcu_dereference(rxqueue->vecls_ftb); ++ if (flow_table && flow_id <= flow_table->mask) { ++ rflow = &flow_table->flows[flow_id]; ++ cpu = READ_ONCE(rflow->cpu); ++ if (rflow->filter == filter_id && cpu < nr_cpu_ids) { ++ if (time_before(jiffies, rflow->timeout + VECLS_TIMEOUT)) { ++ expire = false; ++ } else { ++ rflow->isvalid = 0; ++ WRITE_ONCE(rflow->cpu, VECLS_NO_CPU); ++ } ++ } ++ } ++ rcu_read_unlock(); ++ if (expire) ++ vecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__, ++ dev->name, rxq_index, flow_id, filter_id, expire); ++ return expire; ++} ++ ++static void _vecls_flow_update(struct sock *sk) ++{ ++ struct vecls_sock_flow_table *tb; ++ unsigned int hash, index; ++ u32 val; ++ u32 cpu = raw_smp_processor_id(); ++ ++ if (sk->sk_state != TCP_ESTABLISHED) ++ return; ++ ++ if (check_appname(current->comm)) ++ return; ++ ++ rcu_read_lock(); ++ tb = rcu_dereference(vecls_sock_flow_table); ++ hash = READ_ONCE(sk->sk_rxhash); ++ if (tb && hash) { ++ index = hash & tb->mask; ++ val = hash & ~vecls_cpu_mask; ++ val |= cpu; ++ ++ if (READ_ONCE(tb->ents[index]) != val) { ++ WRITE_ONCE(tb->ents[index], val); ++ } ++ } ++ rcu_read_unlock(); ++} ++ ++static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb) ++{ ++ struct vecls_netdev_info *netdev_info; ++ int netdev_loop; ++ u32 hash, index; ++ struct vecls_numa_info *numa_info; ++ struct vecls_numa_bound_dev_info *bound_dev = NULL; ++ int rxq_id, rxq_num, i; ++ ++ numa_info = get_vecls_numa_info(nid); ++ if (!numa_info) ++ return -1; ++ ++ for_each_vecls_netdev(netdev_loop, netdev_info) { ++ if (strcmp(netdev_info->dev_name, dev->name) == 0) { ++ bound_dev = &numa_info->bound_dev[netdev_loop]; ++ break; ++ } ++ } ++ ++ if (!bound_dev) ++ return -1; ++ rxq_num = bitmap_weight(bound_dev->bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV); ++ if (rxq_num == 0) ++ return -1; ++ ++ hash = skb_get_hash(skb); ++ index = hash % rxq_num; ++ ++ i = 0; ++ for_each_set_bit(rxq_id, bound_dev->bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV) ++ if (index == i++) ++ return rxq_id; ++ ++ return -1; ++} ++ ++static void set_vecls_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct vecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu) ++{ ++ struct netdev_rx_queue *rxqueue; ++ struct vecls_dev_flow_table *dtb; ++ struct vecls_dev_flow *rflow; ++ u32 flow_id, hash; ++ u16 rxq_index; ++ int rc; ++ ++ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || ++ !(dev->features & NETIF_F_NTUPLE)) ++ return; ++ ++ rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); ++ if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0) { ++ vecls_debug("%s skb:%p, old_rxq:%d, next_cpu:%d new_rxq:%d\n", ++ __func__, skb, old_rxq_id, next_cpu, rxq_index); ++ return; ++ } ++ ++ rxqueue = dev->_rx + rxq_index; ++ dtb = rcu_dereference(rxqueue->vecls_ftb); ++ if (!dtb) ++ return; ++ ++ hash = skb_get_hash(skb); ++ flow_id = hash & dtb->mask; ++ rflow = &dtb->flows[flow_id]; ++ ++ if (rflow->isvalid && cpu_to_node(rflow->cpu) == 
cpu_to_node(next_cpu)) { ++ rflow->timeout = jiffies; ++ return; ++ } ++ ++ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); ++ if (rc < 0) { ++ vecls_debug("%s ndo_rx_flow_steer skb:%p rxq:%d hash:0x%x flow_id:%u" ++ "old_rxq:%d rflow->cpu:%d rflow->isvalid:%d next_cpu:%d rc:%d\n", ++ __func__, skb, rxq_index, hash, flow_id, old_rxq_id, rflow->cpu, ++ rflow->isvalid, next_cpu, rc); ++ return; ++ } ++ ++ rflow->filter = rc; ++ rflow->isvalid = 1; ++ rflow->timeout = jiffies; ++ if (old_rflow->filter == rflow->filter) ++ old_rflow->filter = VECLS_NO_FILTER; ++ rflow->cpu = next_cpu; ++} ++ ++static int get_cpu_in_numa(int tcpu, u32 hash) ++{ ++ const struct cpumask *mask; ++ int nr_cpus, cpu, index; ++ ++ mask = cpumask_of_node(cpu_to_node(tcpu)); ++ nr_cpus = cpumask_weight(mask); ++ if (nr_cpus == 0) ++ return -1; ++ ++ index = reciprocal_scale(hash, nr_cpus); ++ if (index < 0) ++ return -1; ++ ++ cpu = cpumask_first(mask); ++ while (--nr_cpus > 0) { ++ if (index == 0) ++ break; ++ cpu = cpumask_next(cpu, mask); ++ index--; ++ } ++ return cpu; ++} ++ ++static void __vecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, ++ struct vecls_sock_flow_table *tb, struct vecls_dev_flow_table *dtb, ++ int old_rxq_id, int *rcpu, int *last_qtail) ++{ ++ struct vecls_dev_flow *rflow; ++ u32 last_recv_cpu, hash, val; ++ int tcpu = 0, newcpu; ++ u32 cpu = raw_smp_processor_id(); ++ ++ skb_reset_network_header(skb); ++ hash = skb_get_hash(skb); ++ if (!hash) ++ return; ++ ++ val = READ_ONCE(tb->ents[hash & tb->mask]); ++ last_recv_cpu = val & vecls_cpu_mask; ++ rflow = &dtb->flows[hash & dtb->mask]; ++ tcpu = rflow->cpu; ++ ++ if ((val ^ hash) & ~vecls_cpu_mask) ++ return; ++ ++ newcpu = get_cpu_in_numa(last_recv_cpu, hash); ++ if (newcpu >= 0) ++ *rcpu = newcpu; ++ else ++ newcpu = last_recv_cpu; ++ ++ if (cpu_to_node(cpu) == cpu_to_node(newcpu)) ++ return; ++ ++ if (tcpu >= nr_cpu_ids) ++ set_vecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); ++} ++ ++static void _vecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) ++{ ++ struct net_device *ndev = skb->dev; ++ struct vecls_sock_flow_table *stb; ++ struct vecls_dev_flow_table *dtb; ++ struct netdev_rx_queue *rxqueue; ++ int rxq_id = -1; ++ ++ *cpu = -1; ++ last_qtail = 0;//unused ++ if (!ndev) ++ return; ++ ++ if (!is_vecls_config_netdev(ndev->name)) ++ return; ++ ++ rxqueue = ndev->_rx; ++ if (skb_rx_queue_recorded(skb)) { ++ rxq_id = skb_get_rx_queue(skb); ++ if (rxq_id >= ndev->real_num_rx_queues) { ++ vecls_debug("%s ndev:%s rxq:%d real_num:%d\n", __func__, ++ ndev->name, rxq_id, ndev->real_num_rx_queues); ++ return; ++ } ++ rxqueue += rxq_id; ++ } ++ ++ if (rxq_id < 0) ++ return; ++ ++ rcu_read_lock(); ++ stb = rcu_dereference(vecls_sock_flow_table); ++ dtb = rcu_dereference(rxqueue->vecls_ftb); ++ if (stb && dtb) ++ __vecls_set_cpu(skb, ndev, stb, dtb, rxq_id, cpu, last_qtail); ++ rcu_read_unlock(); ++} ++ ++static void vecls_dev_flow_table_free(struct rcu_head *rcu) ++{ ++ struct vecls_dev_flow_table *table = container_of(rcu, ++ struct vecls_dev_flow_table, rcu); ++ vfree(table); ++} ++ ++static void vecls_dev_flow_table_cleanup(struct net_device *netdev, int qid) ++{ ++ struct vecls_dev_flow_table *dtb; ++ struct netdev_rx_queue *queue; ++ int i; ++ ++ spin_lock(&vecls_dev_flow_lock); ++ for (i = 0; i < qid; i++) { ++ queue = netdev->_rx + i; ++ dtb = rcu_dereference_protected(queue->vecls_ftb, ++ lockdep_is_held(&vecls_dev_flow_lock)); ++ rcu_assign_pointer(queue->vecls_ftb, NULL); ++ } ++ 
spin_unlock(&vecls_dev_flow_lock); ++ call_rcu(&dtb->rcu, vecls_dev_flow_table_free); ++} ++ ++static int vecls_dev_flow_table_release(void) ++{ ++ struct vecls_netdev_info *netdev_info; ++ int netdev_loop; ++ struct net_device *netdev; ++ ++ for_each_vecls_netdev(netdev_loop, netdev_info) { ++ netdev = netdev_info->netdev; ++ if (!netdev) ++ continue; ++ vecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues); ++ } ++ ++ return 0; ++} ++ ++static int _vecls_dev_flow_table_init(struct net_device *netdev) ++{ ++ struct vecls_dev_flow_table *table; ++ int size = VECLS_DEV_FLOW_TABLE_NUM; ++ struct netdev_rx_queue *queue; ++ int i, j, ret = 0; ++ ++ size = roundup_pow_of_two(size); ++ vecls_debug("%s dev:%s num_rx_queues:%d mask:0x%x\n", ++ __func__, netdev->name, netdev->num_rx_queues, size - 1); ++ ++ for (i = 0; i < netdev->num_rx_queues; i++) { ++ table = vmalloc(VECLS_DEV_FLOW_TABLE_SIZE(size)); ++ if (!table) { ++ ret = -ENOMEM; ++ goto fail; ++ } ++ ++ table->mask = size - 1; ++ for (j = 0; j < size; j++) { ++ table->flows[j].cpu = VECLS_NO_CPU; ++ table->flows[j].isvalid = 0; ++ } ++ ++ queue = netdev->_rx + i; ++ ++ spin_lock(&vecls_dev_flow_lock); ++ rcu_assign_pointer(queue->vecls_ftb, table); ++ spin_unlock(&vecls_dev_flow_lock); ++ } ++ return ret; ++fail: ++ vecls_dev_flow_table_cleanup(netdev, i); ++ return ret; ++} ++ ++static int vecls_dev_flow_table_init(void) ++{ ++ struct vecls_netdev_info *netdev_info; ++ int netdev_loop; ++ struct net_device *ndev; ++ int i, err; ++ ++ for_each_vecls_netdev(netdev_loop, netdev_info) { ++ ndev = netdev_info->netdev; ++ if (!ndev) ++ continue; ++ err = _vecls_dev_flow_table_init(ndev); ++ if (err) ++ goto out; ++ } ++ ++ return 0; ++out: ++ for (i = 0; i < netdev_loop; i++) { ++ netdev_info = get_vecls_netdev_info(i); ++ ndev = netdev_info->netdev; ++ if (!ndev) ++ continue; ++ vecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues); ++ } ++ return err; ++} ++ ++static const struct vecls_hook_ops vecls_flow_ops = { ++ .vecls_flow_update = _vecls_flow_update, ++ .vecls_set_cpu = _vecls_set_cpu, ++ .vecls_timeout = _vecls_timeout, ++ .vecls_cfg_rxcls = NULL, ++}; ++ ++static int vecls_sock_flow_table_release(void) ++{ ++ struct vecls_sock_flow_table *tb; ++ ++ mutex_lock(&vecls_sock_flow_mutex); ++ tb = rcu_dereference_protected(vecls_sock_flow_table, ++ lockdep_is_held(&vecls_sock_flow_mutex)); ++ if (tb) ++ rcu_assign_pointer(vecls_sock_flow_table, NULL); ++ mutex_unlock(&vecls_sock_flow_mutex); ++ synchronize_rcu(); ++ vfree(tb); ++ ++ return 0; ++} ++ ++int venetcls_flow_status(struct seq_file *seq, void *v) ++{ ++ struct vecls_netdev_info *netdev_info; ++ struct vecls_dev_flow_table *dtb; ++ struct netdev_rx_queue *queue; ++ struct net_device *netdev; ++ int netdev_loop, i, j; ++ ++ seq_printf(seq, "%-16s %-6s %-12s %-12s %-12s\n", ++ "Interface", "rxq", "flowCPU", "filterId", "timeout"); ++ spin_lock(&vecls_dev_flow_lock); ++ for_each_vecls_netdev(netdev_loop, netdev_info) { ++ netdev = netdev_info->netdev; ++ if (!netdev) ++ continue; ++ for (i = 0; i < netdev->num_rx_queues; i++) { ++ queue = netdev->_rx + i; ++ dtb = rcu_dereference_protected(queue->vecls_ftb, ++ lockdep_is_held(&vecls_dev_flow_lock)); ++ if (!dtb) ++ continue; ++ for (j = 0; j < VECLS_DEV_FLOW_TABLE_NUM; j++) { ++ if (dtb->flows[j].cpu == VECLS_NO_CPU) ++ continue; ++ if (dtb->flows[j].isvalid == 0) ++ continue; ++ if (time_before(jiffies, dtb->flows[j].timeout + VECLS_TIMEOUT)) { ++ seq_printf(seq, "%-16s %-6d %-12d %-12d %-12u\n", netdev_info->dev_name, ++ i, 
dtb->flows[j].cpu, dtb->flows[j].filter, ++ jiffies_to_msecs(dtb->flows[j].timeout + VECLS_TIMEOUT - jiffies)); ++ } ++ } ++ } ++ } ++ spin_unlock(&vecls_dev_flow_lock); ++ ++ return 0; ++} ++ ++static int vecls_sock_flow_table_init(void) ++{ ++ struct vecls_sock_flow_table *table; ++ int size = VECLS_SOCK_FLOW_TABLE_NUM; ++ int i; ++ ++ size = roundup_pow_of_two(size); ++ table = vmalloc(VECLS_SOCK_FLOW_TABLE_SIZE(size)); ++ if (!table) ++ return -ENOMEM; ++ ++ vecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; ++ vecls_debug("nr_cpu_ids:%d, vecls_cpu_mask:0x%x\n", nr_cpu_ids, vecls_cpu_mask); ++ ++ table->mask = size - 1; ++ for (i = 0; i < size; i++) ++ table->ents[i] = VECLS_NO_CPU; ++ ++ mutex_lock(&vecls_sock_flow_mutex); ++ rcu_assign_pointer(vecls_sock_flow_table, table); ++ mutex_unlock(&vecls_sock_flow_mutex); ++ ++ return 0; ++} ++ ++int vecls_flow_res_init(void) ++{ ++ int err; ++ ++ err = vecls_sock_flow_table_init(); ++ if (err) ++ return err; ++ err = vecls_dev_flow_table_init(); ++ if (err) ++ goto clean; ++ ++ RCU_INIT_POINTER(vecls_ops, &vecls_flow_ops); ++ synchronize_rcu(); ++ ++ return 0; ++clean: ++ vecls_sock_flow_table_release(); ++ return err; ++} ++ ++void vecls_flow_res_clean(void) ++{ ++ RCU_INIT_POINTER(vecls_ops, NULL); ++ synchronize_rcu(); ++ vecls_sock_flow_table_release(); ++ vecls_dev_flow_table_release(); ++} +diff --git a/net/venetcls/venetcls_main.c b/net/venetcls/venetcls_main.c +new file mode 100644 +index 000000000..80895035f +--- /dev/null ++++ b/net/venetcls/venetcls_main.c +@@ -0,0 +1,1086 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "venetcls.h" ++ ++int vecls_netdev_num; ++static struct vecls_netdev_info vecls_netdev_info_table[VECLS_MAX_NETDEV_NUM]; ++ ++int vecls_numa_num; ++static int vecls_cluster_cpu_num, vecls_cluster_per_numa; ++static struct vecls_numa_info *vecls_numa_info_table; ++ ++int debug; ++module_param(debug, int, 0644); ++MODULE_PARM_DESC(debug, "debug switch"); ++ ++static int mode; ++module_param(mode, int, 0444); ++MODULE_PARM_DESC(mode, "mode, default 0"); ++ ++static char ifname[64] = { 0 }; ++module_param_string(ifname, ifname, sizeof(ifname), 0444); ++MODULE_PARM_DESC(ifname, "ifname"); ++ ++static char appname[64] = "redis-server"; ++module_param_string(appname, appname, sizeof(appname), 0644); ++MODULE_PARM_DESC(appname, "appname, default redis-server"); ++ ++int match_ip_flag = 1; ++module_param(match_ip_flag, int, 0644); ++MODULE_PARM_DESC(match_ip_flag, "match ip flag"); ++ ++static int strategy; ++module_param(strategy, int, 0444); ++MODULE_PARM_DESC(strategy, "strategy, default 0"); ++ ++static bool check_params(void) ++{ ++ if (mode != 0 && mode != 1) ++ return false; ++ ++ if (strlen(ifname) == 0) ++ return false; ++ ++ return true; ++} ++ ++int check_appname(char *task_name) ++{ ++ char *start = appname, *end; ++ ++ if (!strlen(appname)) ++ return 0; ++ ++ // support appname: app1#app2#appN ++ while (*start != '\0') { ++ end = strchr(start, '#'); ++ if (end == start) { ++ start++; ++ continue; ++ } ++ ++ if (!end) { ++ if (!strncmp(task_name, start, strlen(start))) ++ return 0; ++ break; ++ } ++ ++ if (!strncmp(task_name, start, end - start)) ++ return 0; ++ start = end + 1; ++ } ++ return -EOPNOTSUPP; ++} ++ ++static u32 __ethtool_get_flags(struct net_device *dev) ++{ ++ u32 flags = 0; ++ ++ if (dev->features & NETIF_F_LRO) ++ flags |= ETH_FLAG_LRO; ++ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) 
++ flags |= ETH_FLAG_RXVLAN; ++ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) ++ flags |= ETH_FLAG_TXVLAN; ++ if (dev->features & NETIF_F_NTUPLE) ++ flags |= ETH_FLAG_NTUPLE; ++ if (dev->features & NETIF_F_RXHASH) ++ flags |= ETH_FLAG_RXHASH; ++ ++ return flags; ++} ++ ++static int __ethtool_set_flags(struct net_device *dev, u32 data) ++{ ++ netdev_features_t features = 0, changed; ++ ++ if (data & ~ETH_ALL_FLAGS) ++ return -EINVAL; ++ ++ if (data & ETH_FLAG_LRO) ++ features |= NETIF_F_LRO; ++ if (data & ETH_FLAG_RXVLAN) ++ features |= NETIF_F_HW_VLAN_CTAG_RX; ++ if (data & ETH_FLAG_TXVLAN) ++ features |= NETIF_F_HW_VLAN_CTAG_TX; ++ if (data & ETH_FLAG_NTUPLE) ++ features |= NETIF_F_NTUPLE; ++ if (data & ETH_FLAG_RXHASH) ++ features |= NETIF_F_RXHASH; ++ ++ /* allow changing only bits set in hw_features */ ++ changed = (features ^ dev->features) & ETH_ALL_FEATURES; ++ if (changed & ~dev->hw_features) ++ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; ++ ++ dev->wanted_features = ++ (dev->wanted_features & ~changed) | (features & changed); ++ ++ __netdev_update_features(dev); ++ ++ return 0; ++} ++ ++static void ethtool_rxnfc_copy_to_user(void *useraddr, ++ const struct ethtool_rxnfc *rxnfc, ++ size_t size, const u32 *rule_buf) ++{ ++ memcpy_r(useraddr, rxnfc, size); ++ useraddr += offsetof(struct ethtool_rxnfc, rule_locs); ++ ++ if (rule_buf) ++ memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32)); ++} ++ ++static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, ++ u32 cmd, void *useraddr) ++{ ++ struct ethtool_rxnfc info; ++ size_t info_size = sizeof(info); ++ int rc; ++ ++ if (!dev->ethtool_ops->set_rxnfc) ++ return -EOPNOTSUPP; ++ ++ if (cmd == ETHTOOL_SRXFH) ++ info_size = (offsetof(struct ethtool_rxnfc, data) + ++ sizeof(info.data)); ++ ++ memcpy_r(&info, useraddr, info_size); ++ rc = dev->ethtool_ops->set_rxnfc(dev, &info); ++ if (rc) ++ return rc; ++ ++ if (cmd == ETHTOOL_SRXCLSRLINS) ++ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); ++ ++ return 0; ++} ++ ++static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, ++ u32 cmd, void *useraddr) ++{ ++ struct ethtool_rxnfc info; ++ size_t info_size = sizeof(info); ++ const struct ethtool_ops *ops = dev->ethtool_ops; ++ int ret; ++ void *rule_buf = NULL; ++ ++ if (!ops->get_rxnfc) ++ return -EOPNOTSUPP; ++ ++ if (cmd == ETHTOOL_GRXFH) ++ info_size = (offsetof(struct ethtool_rxnfc, data) + ++ sizeof(info.data)); ++ ++ memcpy_r(&info, useraddr, info_size); ++ ++ /* If FLOW_RSS was requested then user-space must be using the ++ * new definition, as FLOW_RSS is newer. ++ */ ++ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { ++ info_size = sizeof(info); ++ memcpy_r(&info, useraddr, info_size); ++ /* Since malicious users may modify the original data, ++ * we need to check whether FLOW_RSS is still requested. 
++ */ ++ if (!(info.flow_type & FLOW_RSS)) ++ return -EINVAL; ++ } ++ ++ if (info.cmd != cmd) ++ return -EINVAL; ++ ++ if (info.cmd == ETHTOOL_GRXCLSRLALL) { ++ if (info.rule_cnt > 0) { ++ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) ++ rule_buf = kcalloc(info.rule_cnt, sizeof(u32), ++ GFP_KERNEL); ++ if (!rule_buf) ++ return -ENOMEM; ++ } ++ } ++ ++ ret = ops->get_rxnfc(dev, &info, rule_buf); ++ if (ret < 0) ++ goto err_out; ++ ++ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); ++err_out: ++ kfree(rule_buf); ++ ++ return ret; ++} ++ ++static noinline_for_stack int ethtool_get_channels(struct net_device *dev, ++ void *useraddr) ++{ ++ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; ++ ++ if (!dev->ethtool_ops->get_channels) ++ return -EOPNOTSUPP; ++ ++ dev->ethtool_ops->get_channels(dev, &channels); ++ ++ memcpy_r(useraddr, &channels, sizeof(channels)); ++ return 0; ++} ++ ++static int ethtool_get_value(struct net_device *dev, char *useraddr, ++ u32 cmd, u32 (*actor)(struct net_device *)) ++{ ++ struct ethtool_value edata = { .cmd = cmd }; ++ ++ if (!actor) ++ return -EOPNOTSUPP; ++ ++ edata.data = actor(dev); ++ ++ memcpy_r(useraddr, &edata, sizeof(edata)); ++ return 0; ++} ++ ++static int ethtool_set_value(struct net_device *dev, char *useraddr, ++ int (*actor)(struct net_device *, u32)) ++{ ++ struct ethtool_value edata; ++ ++ if (!actor) ++ return -EOPNOTSUPP; ++ ++ memcpy_r(&edata, useraddr, sizeof(edata)); ++ ++ return actor(dev, edata.data); ++} ++ ++static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) ++{ ++ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); ++ void *useraddr = ifr->ifr_data; ++ u32 ethcmd, sub_cmd; ++ int rc; ++ netdev_features_t old_features; ++ ++ if (!dev || !netif_device_present(dev)) ++ return -ENODEV; ++ ++ memcpy_r(ðcmd, useraddr, sizeof(ethcmd)); ++ ++ if (ethcmd == ETHTOOL_PERQUEUE) ++ memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)); ++ else ++ sub_cmd = ethcmd; ++ ++ if (dev->ethtool_ops->begin) { ++ rc = dev->ethtool_ops->begin(dev); ++ if (rc < 0) ++ return rc; ++ } ++ old_features = dev->features; ++ ++ switch (ethcmd) { ++ case ETHTOOL_GFLAGS: ++ rc = ethtool_get_value(dev, useraddr, ethcmd, ++ __ethtool_get_flags); ++ break; ++ case ETHTOOL_SFLAGS: ++ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); ++ break; ++ case ETHTOOL_GRXFH: ++ case ETHTOOL_GRXRINGS: ++ case ETHTOOL_GRXCLSRLCNT: ++ case ETHTOOL_GRXCLSRULE: ++ case ETHTOOL_GRXCLSRLALL: ++ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); ++ break; ++ case ETHTOOL_SRXFH: ++ case ETHTOOL_SRXCLSRLDEL: ++ case ETHTOOL_SRXCLSRLINS: ++ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); ++ break; ++ case ETHTOOL_GCHANNELS: ++ rc = ethtool_get_channels(dev, useraddr); ++ break; ++ default: ++ rc = -EOPNOTSUPP; ++ } ++ ++ if (dev->ethtool_ops->complete) ++ dev->ethtool_ops->complete(dev); ++ ++ if (old_features != dev->features) ++ netdev_features_change(dev); ++ ++ return rc; ++} ++ ++int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd) ++{ ++ struct ifreq ifr = {0}; ++ int ret; ++ ++ strncpy(ifr.ifr_name, ctx->netdev, IFNAMSIZ); ++ ifr.ifr_data = cmd; ++ ++ rtnl_lock(); ++ ret = dev_ethtool_kern(&init_net, &ifr); ++ rtnl_unlock(); ++ ++ return ret; ++} ++ ++struct vecls_netdev_info *get_vecls_netdev_info(unsigned int index) ++{ ++ if (index >= VECLS_MAX_NETDEV_NUM) ++ return NULL; ++ return &vecls_netdev_info_table[index]; ++} ++ ++static struct vecls_netdev_info *alloc_vecls_netdev_info(void) ++{ ++ if 
(vecls_netdev_num >= VECLS_MAX_NETDEV_NUM) ++ return NULL; ++ ++ return &vecls_netdev_info_table[vecls_netdev_num++]; ++} ++ ++static bool check_irq_name(const char *irq_name, struct vecls_netdev_info *vecls_dev) ++{ ++ if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx")) ++ return false; ++ ++ if (strstr(irq_name, vecls_dev->dev_name)) ++ return true; ++ ++ if (vecls_dev->netdev->dev.parent && ++ strstr(irq_name, dev_name(vecls_dev->netdev->dev.parent))) ++ return true; ++ ++ return false; ++} ++ ++static void get_netdev_queue_info(struct vecls_netdev_info *vecls_dev) ++{ ++ struct vecls_netdev_queue_info *rxq_info; ++ struct irq_desc *desc; ++ int irq, cpu; ++ ++ for_each_irq_desc(irq, desc) { ++ if (!desc->action) ++ continue; ++ if (!desc->action->name) ++ continue; ++ if (!check_irq_name(desc->action->name, vecls_dev)) ++ continue; ++ if (vecls_dev->rxq_num >= VECLS_MAX_RXQ_NUM_PER_DEV) ++ break; ++ rxq_info = &vecls_dev->rxq[vecls_dev->rxq_num++]; ++ rxq_info->irq = irq; ++ cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data)); ++ rxq_info->affinity_cpu = cpu; ++ vecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n", ++ irq, desc->action->name, vecls_dev->rxq_num, cpu); ++ } ++} ++ ++static int vecls_filter_enable(const char *dev_name, bool *old_state) ++{ ++ struct ethtool_value eval = {0}; ++ struct cmd_context ctx = {0}; ++ int ret; ++ ++ strncpy(ctx.netdev, dev_name, IFNAMSIZ); ++ ++ eval.cmd = ETHTOOL_GFLAGS; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ if (eval.data & ETH_FLAG_NTUPLE) { ++ *old_state = true; ++ vecls_debug("%s ntuple is already on\n", dev_name); ++ return 0; ++ } ++ ++ // Set ntuple feature ++ eval.cmd = ETHTOOL_SFLAGS; ++ eval.data |= ETH_FLAG_NTUPLE; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("set %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ ++ // Get ntuple feature ++ eval.cmd = ETHTOOL_GFLAGS; ++ eval.data = 0; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return ret; ++ } ++ if (!(eval.data & ETH_FLAG_NTUPLE)) { ++ vecls_error("enable ntuple feature fail!\n"); ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++static void vecls_filter_restore(const char *dev_name, bool old_state) ++{ ++ struct ethtool_value eval = {0}; ++ struct cmd_context ctx = {0}; ++ bool cur_filter_state; ++ int ret; ++ ++ strncpy(ctx.netdev, dev_name, IFNAMSIZ); ++ ++ eval.cmd = ETHTOOL_GFLAGS; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("get %s flags fail, ret:%d\n", dev_name, ret); ++ return; ++ } ++ ++ cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? true : false; ++ if (cur_filter_state == old_state) ++ return; ++ ++ // Set ntuple feature ++ eval.cmd = ETHTOOL_SFLAGS; ++ if (old_state) ++ eval.data |= ETH_FLAG_NTUPLE; ++ else ++ eval.data &= ~ETH_FLAG_NTUPLE; ++ ret = send_ethtool_ioctl(&ctx, &eval); ++ if (ret != 0) { ++ vecls_error("set %s flags fail, ret:%d\n", dev_name, ret); ++ return; ++ } ++} ++ ++static int init_single_vecls_dev(char *if_name, unsigned int length) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ char dev_name[IFNAMSIZ] = { 0 }; ++ struct net_device *netdev; ++ int cpy_len = length < IFNAMSIZ ? 
length : IFNAMSIZ; ++ bool old_state = false; ++ int ret; ++ ++ strncpy(dev_name, if_name, cpy_len); ++ netdev = dev_get_by_name(&init_net, dev_name); ++ if (!netdev) { ++ vecls_error("dev [%s] is not exist!\n", dev_name); ++ return -ENODEV; ++ } ++ ++ if (!(netdev->flags & IFF_UP)) { ++ ret = -ENETDOWN; ++ vecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags); ++ goto out; ++ } ++ ++ if (netdev->flags & IFF_LOOPBACK) { ++ ret = -EOPNOTSUPP; ++ vecls_error("Do not support loopback.\n"); ++ goto out; ++ } ++ ++ ret = vecls_filter_enable(dev_name, &old_state); ++ if (ret) { ++ vecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret); ++ goto out; ++ } ++ ++ vecls_dev = alloc_vecls_netdev_info(); ++ if (!vecls_dev) { ++ ret = -ENOMEM; ++ vecls_filter_restore(dev_name, old_state); ++ vecls_error("alloc vecls_dev fail! vecls_netdev_num:%d\n", vecls_netdev_num); ++ goto out; ++ } ++ ++ memcpy_r(vecls_dev->dev_name, dev_name, IFNAMSIZ); ++ vecls_dev->old_filter_state = old_state; ++ vecls_dev->netdev = netdev; ++ get_netdev_queue_info(vecls_dev); ++ return 0; ++ ++out: ++ dev_put(netdev); ++ return ret; ++} ++ ++static void clean_vecls_netdev_info(void) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ struct net_device *netdev; ++ int devid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ vecls_filter_restore(vecls_dev->dev_name, vecls_dev->old_filter_state); ++ netdev = vecls_dev->netdev; ++ if (netdev) { ++ vecls_dev->netdev = NULL; ++ dev_put(netdev); ++ } ++ } ++ ++ vecls_netdev_num = 0; ++} ++ ++static int init_vecls_netdev_info(char *netdev_str) ++{ ++ char *start = netdev_str, *end; ++ int err = -ENODEV; ++ ++ while (*start != '\0') { ++ // skip start # ++ end = strchr(start, '#'); ++ if (end == start) { ++ start++; ++ continue; ++ } ++ ++ // find the last ifname ++ if (!end) { ++ err = init_single_vecls_dev(start, strlen(start)); ++ break; ++ } ++ ++ err = init_single_vecls_dev(start, end - start); ++ if (err) ++ break; ++ start = end + 1; ++ } ++ ++ return err; ++} ++ ++struct vecls_numa_info *get_vecls_numa_info(unsigned int nid) ++{ ++ if (nid >= vecls_numa_num) ++ return NULL; ++ return &vecls_numa_info_table[nid]; ++} ++ ++static void clean_vecls_numa_info(void) ++{ ++ vecls_numa_num = 0; ++ kfree(vecls_numa_info_table); ++} ++ ++static void init_numa_avail_cpus(int nid, struct vecls_numa_info *numa_info) ++{ ++ int cpu; ++ ++ vecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)), ++ cpumask_pr_args(cpumask_of_node(nid))); ++ ++ bitmap_zero(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); ++ for_each_cpu(cpu, cpumask_of_node(nid)) { ++ if (cpu >= VECLS_MAX_CPU_NUM) ++ return; ++ set_bit(cpu, numa_info->avail_cpus); ++ } ++} ++ ++static void clean_vecls_rxq(void) ++{ ++ struct vecls_numa_bound_dev_info *bound_dev; ++ struct vecls_netdev_info *vecls_dev; ++ struct vecls_numa_info *numa_info; ++ int nid, devid; ++ ++ for_each_vecls_numa(nid, numa_info) { ++ for_each_vecls_netdev(devid, vecls_dev) { ++ bound_dev = &numa_info->bound_dev[devid]; ++ kfree(bound_dev->cluster_info); ++ } ++ } ++} ++ ++static int init_numa_rxq_bitmap(int nid, struct vecls_numa_info *numa_info) ++{ ++ int bound_rxq_num, cluster_id, cluster_idx, cur_idx; ++ struct vecls_numa_bound_dev_info *bound_dev; ++ struct vecls_netdev_info *vecls_dev; ++ int rxq_id, devid, cpu, ret = 0; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ bound_rxq_num = 0; ++ bound_dev = &numa_info->bound_dev[devid]; ++ bitmap_zero(bound_dev->bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV); ++ 
bound_dev->cluster_info = kzalloc(sizeof(struct vecls_numa_clusterinfo) ++ * vecls_cluster_per_numa, GFP_ATOMIC); ++ if (!bound_dev->cluster_info) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { ++ cpu = vecls_dev->rxq[rxq_id].affinity_cpu; ++ if (cpu_to_node(cpu) == nid) { ++ set_bit(rxq_id, bound_dev->bitmap_rxq); ++ cluster_id = cpu / vecls_cluster_cpu_num; ++ cluster_idx = cluster_id % vecls_cluster_per_numa; ++ bound_dev->cluster_info[cluster_idx].cluster_id = cluster_id; ++ cur_idx = bound_dev->cluster_info[cluster_idx].cur_freeidx++; ++ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].rxq_id = rxq_id; ++ bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].status = 1; ++ bound_rxq_num++; ++ vecls_debug("cpu:%d cluster_id:%d cluster_idx:%d rxq_id:%d cur_idx:%d\n", ++ cpu, cluster_id, cluster_idx, rxq_id, cur_idx); ++ } ++ } ++ ++ vecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bit_num:%d, bitmap_rxq:%*pbl\n", ++ nid, devid, vecls_dev->dev_name, vecls_dev->rxq_num, ++ bound_rxq_num, VECLS_MAX_RXQ_NUM_PER_DEV, bound_dev->bitmap_rxq); ++ } ++ return ret; ++ ++out: ++ clean_vecls_rxq(); ++ return ret; ++} ++ ++static int get_cluster_rxq(struct vecls_numa_bound_dev_info *bound_dev, int cpu) ++{ ++ int cluster_id = cpu / vecls_cluster_cpu_num; ++ int i, j, rxq_id; ++ ++ for (i = 0; i < vecls_cluster_per_numa; i++) { ++ if (cluster_id != bound_dev->cluster_info[i].cluster_id) ++ continue; ++ for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) { ++ if (bound_dev->cluster_info[i].rxqs[j].status == 1) { ++ bound_dev->cluster_info[i].rxqs[j].status = 2; ++ rxq_id = bound_dev->cluster_info[i].rxqs[j].rxq_id; ++ vecls_debug("cluster:%d cpu:%d alloc rxq_id:%d\n", ++ cluster_id, cpu, rxq_id); ++ return rxq_id; ++ } ++ } ++ } ++ vecls_debug("cluster:%d no free rxq for cpu:%d\n", cluster_id, cpu); ++ return -1; ++} ++ ++static int put_cluster_rxq(struct vecls_numa_bound_dev_info *bound_dev, int rxq_id) ++{ ++ int i, j; ++ ++ for (i = 0; i < vecls_cluster_per_numa; i++) { ++ for (j = 0; j < VECLS_MAX_RXQ_NUM_PER_DEV; j++) { ++ if (bound_dev->cluster_info[i].rxqs[j].status == 2 && ++ bound_dev->cluster_info[i].rxqs[j].rxq_id == rxq_id) { ++ bound_dev->cluster_info[i].rxqs[j].status = 1; ++ vecls_debug("free rxq_id:%d\n", rxq_id); ++ return 0; ++ } ++ } ++ } ++ vecls_debug("no match malloced rxq_id:%d\n", rxq_id); ++ return -1; ++} ++ ++int alloc_rxq_id(int nid, int cpu, int devid) ++{ ++ struct vecls_numa_bound_dev_info *bound_dev; ++ struct vecls_numa_info *numa_info; ++ int rxq_id; ++ ++ numa_info = get_vecls_numa_info(nid); ++ if (!numa_info) { ++ vecls_error("error nid:%d\n", nid); ++ return -EINVAL; ++ } ++ ++ if (devid >= VECLS_MAX_NETDEV_NUM) { ++ vecls_error("error bound_dev index:%d\n", devid); ++ return -EINVAL; ++ } ++ bound_dev = &numa_info->bound_dev[devid]; ++ ++ if (strategy == 1) { ++ rxq_id = get_cluster_rxq(bound_dev, cpu); ++ if (rxq_id < 0 || rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) ++ vecls_debug("failed to get rxq_id:%d in cluster, try numa\n", rxq_id); ++ else ++ goto found; ++ } ++ ++ rxq_id = find_first_bit(bound_dev->bitmap_rxq, VECLS_MAX_RXQ_NUM_PER_DEV); ++ if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) { ++ vecls_error("error rxq_id:%d\n", rxq_id); ++ return -EINVAL; ++ } ++ ++found: ++ clear_bit(rxq_id, bound_dev->bitmap_rxq); ++ vecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); ++ return rxq_id; ++} ++ ++void free_rxq_id(int nid, int devid, int rxq_id) ++{ ++ struct vecls_numa_bound_dev_info *bound_dev; ++ 
struct vecls_numa_info *numa_info; ++ ++ numa_info = get_vecls_numa_info(nid); ++ if (!numa_info) { ++ vecls_error("error nid:%d\n", nid); ++ return; ++ } ++ ++ if (devid >= VECLS_MAX_NETDEV_NUM) { ++ vecls_error("error bound_dev index:%d\n", devid); ++ return; ++ } ++ bound_dev = &numa_info->bound_dev[devid]; ++ ++ if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) { ++ vecls_error("error rxq_id:%d\n", rxq_id); ++ return; ++ } ++ ++ if (strategy == 1) ++ put_cluster_rxq(bound_dev, rxq_id); ++ ++ if (test_bit(rxq_id, bound_dev->bitmap_rxq)) { ++ vecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id); ++ return; ++ } ++ ++ set_bit(rxq_id, bound_dev->bitmap_rxq); ++ vecls_debug("free nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); ++} ++ ++static int init_vecls_numa_info(void) ++{ ++ struct vecls_numa_info *numa_info; ++ int nid, ret = 0; ++ ++ vecls_numa_num = num_online_nodes(); ++ vecls_numa_info_table = kzalloc(sizeof(struct vecls_numa_info) * vecls_numa_num, GFP_ATOMIC); ++ if (!vecls_numa_info_table) { ++ ret = -ENOMEM; ++ vecls_error("vecls_numa_info_table alloc failed:%d\n", ret); ++ return ret; ++ } ++ ++ vecls_cluster_cpu_num = cpumask_weight(topology_cluster_cpumask(raw_smp_processor_id())); ++ vecls_cluster_per_numa = (nr_cpu_ids / vecls_cluster_cpu_num) / vecls_numa_num; ++ vecls_debug("vecls_numa_num=%d cluster_cpu_num:%d cluster_cpu_num:%d\n", ++ vecls_numa_num, vecls_cluster_per_numa, vecls_cluster_cpu_num); ++ ++ for_each_vecls_numa(nid, numa_info) ++ init_numa_avail_cpus(nid, numa_info); ++ ++ return ret; ++} ++ ++static int alloc_available_cpu(int nid, struct vecls_numa_info *numa_info) ++{ ++ int cpu; ++ ++ cpu = find_first_bit(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); ++ if (cpu >= VECLS_MAX_CPU_NUM) { ++ vecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu); ++ return -1; ++ } ++ ++ clear_bit(cpu, numa_info->avail_cpus); ++ return cpu; ++} ++ ++static void add_netdev_irq_affinity_cpu(struct vecls_netdev_info *vecls_dev, int rxq_id, int cpu) ++{ ++ struct vecls_netdev_queue_info *rxq_info; ++ ++ if (rxq_id >= VECLS_MAX_RXQ_NUM_PER_DEV) ++ return; ++ ++ rxq_info = &vecls_dev->rxq[rxq_id]; ++ rxq_info->affinity_cpu = cpu; ++} ++ ++static void config_affinity_strategy_default(struct vecls_netdev_info *vecls_dev) ++{ ++ struct vecls_numa_info *numa_info; ++ int rxq_num = vecls_dev->rxq_num; ++ int rxq_per_numa = rxq_num / vecls_numa_num; ++ int remain = rxq_num - rxq_per_numa * vecls_numa_num; ++ int numa_rxq_id, rxq_id, nid, cpu; ++ ++ vecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", vecls_dev->dev_name, ++ rxq_num, rxq_per_numa, remain); ++ ++ // average config rxq to every numa ++ for_each_vecls_numa(nid, numa_info) { ++ for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) { ++ cpu = alloc_available_cpu(nid, numa_info); ++ if (cpu < 0) ++ break; ++ ++ rxq_id = rxq_per_numa * nid + numa_rxq_id; ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id, cpu); ++ } ++ } ++ ++ if (!remain) ++ return; ++ ++ // config remain rxq to every numa ++ numa_rxq_id = 0; ++ for_each_vecls_numa(nid, numa_info) { ++ if (numa_rxq_id >= remain) ++ break; ++ cpu = alloc_available_cpu(nid, numa_info); ++ if (cpu < 0) ++ break; ++ ++ rxq_id = rxq_per_numa * vecls_numa_num + numa_rxq_id; ++ numa_rxq_id++; ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id, cpu); ++ } ++} ++ ++static void config_affinity_strategy_cluster(struct vecls_netdev_info *vecls_dev) ++{ ++ int rxq_num = vecls_dev->rxq_num; ++ int rxq_per_numa = rxq_num / vecls_numa_num; ++ int remain = 
rxq_num - rxq_per_numa * vecls_numa_num; ++ int cpu_idx = vecls_cluster_cpu_num - 1; ++ int cluster, cpu, rxq_id = 0, round; ++ ++ round = rxq_per_numa < vecls_cluster_per_numa ? rxq_per_numa : vecls_cluster_per_numa; ++ if (remain > 0) ++ round++; ++ vecls_debug("round=%d\n", round); ++ ++ while (rxq_id < vecls_dev->rxq_num) { ++ for (cluster = 0; cluster < vecls_cluster_per_numa * vecls_numa_num; cluster++) { ++ if (cluster % vecls_cluster_per_numa >= round) ++ continue; ++ cpu = cluster * vecls_cluster_cpu_num + cpu_idx; ++ if (rxq_id >= vecls_dev->rxq_num) ++ break; ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); ++ } ++ cpu_idx--; ++ if (--cpu_idx < 0) ++ cpu_idx = vecls_cluster_cpu_num - 1; ++ } ++} ++ ++static void config_affinity_strategy_numa(struct vecls_netdev_info *vecls_dev) ++{ ++ int rxq_num = vecls_dev->rxq_num; ++ int rxq_per_numa = rxq_num / vecls_numa_num; ++ int cpu_per_numa = nr_cpu_ids / vecls_numa_num; ++ int remain = rxq_num - rxq_per_numa * vecls_numa_num; ++ struct vecls_numa_info *numa_info; ++ int numa_start_cpu, numa_cpu_id; ++ int rxq_id = 0, nid, cpu; ++ ++ for_each_vecls_numa(nid, numa_info) { ++ numa_start_cpu = find_first_bit(numa_info->avail_cpus, VECLS_MAX_CPU_NUM); ++ for (numa_cpu_id = 0; numa_cpu_id < rxq_per_numa; numa_cpu_id++) { ++ cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); ++ if (rxq_id >= vecls_dev->rxq_num) ++ break; ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); ++ } ++ if (remain-- > 0) { ++ cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); ++ add_netdev_irq_affinity_cpu(vecls_dev, rxq_id++, cpu); ++ } ++ } ++} ++ ++static void config_affinity_strategy_custom(struct vecls_netdev_info *vecls_dev) ++{ ++ vecls_debug("dev=%s\n", vecls_dev->dev_name); ++} ++ ++static void config_affinity_strategy(void) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ int devid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ switch (strategy) { ++ case 1: ++ config_affinity_strategy_cluster(vecls_dev); ++ break; ++ case 2: ++ config_affinity_strategy_numa(vecls_dev); ++ break; ++ case 3: ++ config_affinity_strategy_custom(vecls_dev); ++ break; ++ case 0: ++ default: ++ config_affinity_strategy_default(vecls_dev); ++ break; ++ } ++ } ++} ++ ++static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu) ++{ ++ int err = 0; ++ ++ err = irq_set_affinity(irq, get_cpu_mask(cpu)); ++ vecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err); ++} ++ ++static void enable_affinity_strategy(void) ++{ ++ struct vecls_netdev_queue_info *rxq_info; ++ struct vecls_netdev_info *vecls_dev; ++ int rxq_id, devid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { ++ rxq_info = &vecls_dev->rxq[rxq_id]; ++ irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu); ++ } ++ } ++} ++ ++static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id, ++ const struct cpumask *cpu_mask) ++{ ++ int err = 0; ++ ++ err = netif_set_xps_queue(netdev, cpu_mask, rxq_id); ++ vecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id, ++ cpumask_pr_args(cpu_mask), err); ++} ++ ++static void set_netdev_xps_queue(bool enable) ++{ ++ const struct cpumask clear_mask = { 0 }; ++ struct vecls_netdev_info *vecls_dev; ++ const struct cpumask *cpu_mask; ++ int rxq_id, devid, cpu, nid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ for (rxq_id = 0; rxq_id < vecls_dev->rxq_num; rxq_id++) { ++ cpu = vecls_dev->rxq[rxq_id].affinity_cpu; ++ nid = 
cpu_to_node(cpu); ++ if (enable) ++ cpu_mask = cpumask_of_node(nid); ++ else ++ cpu_mask = &clear_mask; ++ ++ netif_set_xps_queue_wrapper(vecls_dev->netdev, rxq_id, cpu_mask); ++ } ++ } ++} ++ ++static int __maybe_unused venetcls_status_seq_show(struct seq_file *seq, void *v) ++{ ++ int err; ++ ++ if (mode == 0) ++ err = venetcls_ntuple_status(seq, v); ++ else ++ err = venetcls_flow_status(seq, v); ++ return err; ++} ++ ++static __init int vecls_init(void) ++{ ++ struct vecls_numa_info *numa_info; ++ int nid, err; ++ ++ if (!check_params()) ++ return -EINVAL; ++ ++ err = init_vecls_numa_info(); ++ if (err) ++ return err; ++ ++ err = init_vecls_netdev_info(ifname); ++ if (err) ++ goto clean_numa; ++ ++ // Set irq affinity ++ config_affinity_strategy(); ++ enable_affinity_strategy(); ++ ++ // Calculate rxq bounded to one numa ++ for_each_vecls_numa(nid, numa_info) { ++ err = init_numa_rxq_bitmap(nid, numa_info); ++ if (err) ++ goto clean_rxq; ++ } ++ ++#ifdef CONFIG_XPS ++ set_netdev_xps_queue(true); ++#endif ++ ++ if (mode == 0) ++ err = vecls_ntuple_res_init(); ++ else ++ err = vecls_flow_res_init(); ++ ++ if (err) ++ goto clean_rxq; ++ ++#ifdef CONFIG_PROC_FS ++ if (!proc_create_net_single("venet_status", 0444, init_net.proc_net, ++ venetcls_status_seq_show, NULL)) { ++ err = -ENOMEM; ++ goto clean_rxq; ++ } ++#endif ++ ++ return 0; ++ ++clean_rxq: ++clean_numa: ++ clean_vecls_netdev_info(); ++ clean_vecls_numa_info(); ++ return err; ++} ++ ++static __exit void vecls_exit(void) ++{ ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("venet_status", init_net.proc_net); ++#endif ++ if (mode == 0) ++ vecls_ntuple_res_clean(); ++ else ++ vecls_flow_res_clean(); ++ ++#ifdef CONFIG_XPS ++ set_netdev_xps_queue(false); ++#endif ++ ++ clean_vecls_rxq(); ++ clean_vecls_netdev_info(); ++ clean_vecls_numa_info(); ++} ++ ++module_init(vecls_init); ++module_exit(vecls_exit); ++ ++MODULE_DESCRIPTION("venetcls"); ++MODULE_LICENSE("GPL v2"); +diff --git a/net/venetcls/venetcls_ntuple.c b/net/venetcls/venetcls_ntuple.c +new file mode 100644 +index 000000000..135e2e049 +--- /dev/null ++++ b/net/venetcls/venetcls_ntuple.c +@@ -0,0 +1,643 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "venetcls.h" ++ ++struct vecls_sk_rule_list vecls_sk_rules, vecls_sk_list; ++static struct workqueue_struct *do_cfg_workqueue; ++static atomic_t vecls_worker_count = ATOMIC_INIT(0); ++ ++static void init_vecls_sk_rules(void) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) ++ INIT_HLIST_HEAD(vecls_sk_rules.hash + i); ++ mutex_init(&vecls_sk_rules.mutex); ++} ++ ++static inline struct hlist_head *get_rule_hashlist(u32 dip4, u16 dport) ++{ ++ return vecls_sk_rules.hash + (jhash_2words(dip4, dport, 0) & VECLS_SK_RULE_HASHMASK); ++} ++ ++static inline struct hlist_head *get_sk_hashlist(void *sk) ++{ ++ return vecls_sk_list.hash + (jhash(sk, sizeof(sk), 0) & VECLS_SK_RULE_HASHMASK); ++} ++ ++static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action, ++ int ruleid, int nid) ++{ ++ struct hlist_head *hlist = get_rule_hashlist(dip4, dport); ++ struct hlist_head *sk_hlist = get_sk_hashlist(sk); ++ struct vecls_sk_rule *rule; ++ struct vecls_sk_entry *entry; ++ ++ rule = kzalloc(sizeof(struct vecls_sk_rule), GFP_ATOMIC); ++ entry = kzalloc(sizeof(struct vecls_sk_entry), GFP_ATOMIC); ++ if (!rule || !entry) ++ goto out; ++ ++ rule->sk = sk; ++ rule->dip4 = dip4; ++ rule->dport = 
dport; ++ rule->devid = devid; ++ rule->action = action; ++ rule->ruleid = ruleid; ++ rule->nid = nid; ++ hlist_add_head(&rule->node, hlist); ++ ++ entry->sk = sk; ++ entry->sk_rule_hash = jhash_2words(dip4, dport, 0); ++ hlist_add_head(&entry->node, sk_hlist); ++ return; ++out: ++ vecls_debug("alloc failed rule:%p entry:%p\n", rule, entry); ++ kfree(entry); ++ kfree(rule); ++} ++ ++static struct vecls_sk_entry *get_sk_entry(void *sk) ++{ ++ struct hlist_head *sk_hlist = get_sk_hashlist(sk); ++ struct vecls_sk_entry *entry = NULL; ++ ++ hlist_for_each_entry(entry, sk_hlist, node) { ++ if (entry->sk == sk) ++ break; ++ } ++ return entry; ++} ++ ++static void del_sk_rule(struct vecls_sk_rule *rule) ++{ ++ struct vecls_sk_entry *entry; ++ ++ entry = get_sk_entry(rule->sk); ++ if (!entry) ++ return; ++ hlist_del_init(&entry->node); ++ kfree(entry); ++ ++ vecls_debug("del rule=%p\n", rule); ++ hlist_del_init(&rule->node); ++ kfree(rule); ++} ++ ++static struct vecls_sk_rule *get_sk_rule(int devid, u32 dip4, u16 dport) ++{ ++ struct hlist_head *hlist = get_rule_hashlist(dip4, dport); ++ struct vecls_sk_rule *rule = NULL; ++ ++ hlist_for_each_entry(rule, hlist, node) { ++ if (rule->devid == devid && rule->dip4 == dip4 && rule->dport == dport) ++ break; ++ } ++ return rule; ++} ++ ++static struct vecls_sk_rule *get_rule_from_sk(int devid, void *sk) ++{ ++ struct vecls_sk_rule *rule = NULL; ++ struct vecls_sk_entry *entry; ++ struct hlist_head *hlist; ++ ++ entry = get_sk_entry(sk); ++ if (!entry) ++ return NULL; ++ ++ hlist = vecls_sk_rules.hash + (entry->sk_rule_hash & VECLS_SK_RULE_HASHMASK); ++ hlist_for_each_entry(rule, hlist, node) { ++ if (rule->devid == devid && rule->sk == sk) ++ break; ++ } ++ return rule; ++} ++ ++static inline bool reuseport_check(int devid, u32 dip4, u16 dport) ++{ ++ return !!get_sk_rule(devid, dip4, dport); ++} ++ ++static u32 get_first_ip4_addr(struct net *net) ++{ ++ struct in_device *in_dev; ++ struct net_device *dev; ++ struct in_ifaddr *ifa; ++ u32 dip4 = 0; ++ ++ rtnl_lock(); ++ rcu_read_lock(); ++ for_each_netdev(net, dev) { ++ if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP)) ++ continue; ++ in_dev = __in_dev_get_rcu(dev); ++ if (!in_dev) ++ continue; ++ ++ in_dev_for_each_ifa_rcu(ifa, in_dev) { ++ if (!strcmp(dev->name, ifa->ifa_label)) { ++ dip4 = ifa->ifa_local; ++ vecls_debug("dev: %s, dip4:%pI4\n", dev->name, &dip4); ++ goto out; ++ } ++ } ++ } ++out: ++ rcu_read_unlock(); ++ rtnl_unlock(); ++ return dip4; ++} ++ ++static void get_sk_rule_addr(struct sock *sk, u32 *dip4, u16 *dport) ++{ ++ *dport = htons(sk->sk_num); ++ ++ if (!match_ip_flag) { ++ *dip4 = 0; ++ return; ++ } ++ ++ if (sk->sk_rcv_saddr) ++ *dip4 = sk->sk_rcv_saddr; ++ else ++ *dip4 = get_first_ip4_addr(sock_net(sk)); ++} ++ ++static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc) ++{ ++ struct ethtool_rxnfc nfccmd; ++ int err; ++ ++ nfccmd.cmd = ETHTOOL_SRXCLSRLDEL; ++ nfccmd.fs.location = loc; ++ err = send_ethtool_ioctl(ctx, &nfccmd); ++ if (err < 0) ++ vecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc); ++ return err; ++} ++ ++static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc) ++{ ++ if (loc >= rmgr->size) { ++ vecls_error("rmgr: Location out of range\n"); ++ return -1; ++ } ++ ++ set_bit(loc, rmgr->slot); ++ return 0; ++} ++ ++static int rmgr_find_empty_slot(struct rmgr_ctrl *rmgr, struct ethtool_rx_flow_spec *fsp) ++{ ++ __u32 loc, slot_num; ++ ++ if (rmgr->driver_select) ++ return 0; ++ ++ loc = rmgr->size - 1; ++ slot_num = loc / BITS_PER_LONG; ++ if 
(!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) { ++ loc -= 1 + (loc % BITS_PER_LONG); ++ slot_num--; ++ } ++ ++ while (loc < rmgr->size && !~(rmgr->slot[slot_num])) { ++ loc -= BITS_PER_LONG; ++ slot_num--; ++ } ++ ++ while (loc < rmgr->size && test_bit(loc, rmgr->slot)) ++ loc--; ++ ++ if (loc < rmgr->size) { ++ fsp->location = loc; ++ return rmgr_ins(rmgr, loc); ++ } ++ ++ return -1; ++} ++ ++static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select) ++{ ++ struct ethtool_rxnfc nfccmd; ++ int err; ++ ++ nfccmd.cmd = ETHTOOL_GRXCLSRLCNT; ++ nfccmd.data = 0; ++ err = send_ethtool_ioctl(ctx, &nfccmd); ++ *count = nfccmd.rule_cnt; ++ if (driver_select) ++ *driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL); ++ if (err < 0) ++ vecls_debug("rxclass: Cannot get RX class rule count\n"); ++ ++ return err; ++} ++ ++static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr) ++{ ++ struct ethtool_rxnfc *nfccmd; ++ __u32 *rule_locs; ++ int i, err = 0; ++ ++ memset(rmgr, 0, sizeof(*rmgr)); ++ err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select); ++ if (err < 0) ++ return err; ++ ++ if (rmgr->driver_select) ++ return err; ++ ++ nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC); ++ if (!nfccmd) { ++ vecls_error("rmgr: Cannot allocate memory for RX class rule locations\n"); ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ nfccmd->cmd = ETHTOOL_GRXCLSRLALL; ++ nfccmd->rule_cnt = rmgr->n_rules; ++ err = send_ethtool_ioctl(ctx, nfccmd); ++ if (err < 0) { ++ vecls_debug("rmgr: Cannot get RX class rules\n"); ++ goto out; ++ } ++ ++ rmgr->size = nfccmd->data; ++ if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) { ++ vecls_error("rmgr: Invalid RX class rules table size\n"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), GFP_ATOMIC); ++ if (!rmgr->slot) { ++ vecls_error("rmgr: Cannot allocate memory for RX class rules\n"); ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ rule_locs = nfccmd->rule_locs; ++ for (i = 0; i < rmgr->n_rules; i++) { ++ err = rmgr_ins(rmgr, rule_locs[i]); ++ if (err < 0) ++ break; ++ } ++ ++out: ++ kfree(nfccmd); ++ return err; ++} ++ ++static void rmgr_cleanup(struct rmgr_ctrl *rmgr) ++{ ++ kfree(rmgr->slot); ++ rmgr->slot = NULL; ++ rmgr->size = 0; ++} ++ ++static int rmgr_set_location(struct cmd_context *ctx, ++ struct ethtool_rx_flow_spec *fsp) ++{ ++ struct rmgr_ctrl rmgr; ++ int ret; ++ ++ ret = rmgr_init(ctx, &rmgr); ++ if (ret < 0) ++ goto out; ++ ++ ret = rmgr_find_empty_slot(&rmgr, fsp); ++out: ++ rmgr_cleanup(&rmgr); ++ return ret; ++} ++ ++static int rxclass_rule_ins(struct cmd_context *ctx, ++ struct ethtool_rx_flow_spec *fsp, u32 rss_context) ++{ ++ struct ethtool_rxnfc nfccmd; ++ u32 loc = fsp->location; ++ int ret; ++ ++ if (loc & RX_CLS_LOC_SPECIAL) { ++ ret = rmgr_set_location(ctx, fsp); ++ if (ret < 0) ++ return ret; ++ } ++ ++ nfccmd.cmd = ETHTOOL_SRXCLSRLINS; ++ nfccmd.rss_context = rss_context; ++ nfccmd.fs = *fsp; ++ ret = send_ethtool_ioctl(ctx, &nfccmd); ++ if (ret < 0) { ++ vecls_debug("Can not insert the clasification rule\n"); ++ return ret; ++ } ++ ++ if (loc & RX_CLS_LOC_SPECIAL) ++ vecls_debug("Added rule with ID %d\n", nfccmd.fs.location); ++ ++ return 0; ++} ++ ++static void flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp, ++ struct ethtool_rx_ntuple_flow_spec *ntuple) ++{ ++ int i; ++ ++ memset(ntuple, ~0, sizeof(*ntuple)); ++ ntuple->flow_type = fsp->flow_type; ++ ntuple->action = 
fsp->ring_cookie; ++ memcpy_r(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u)); ++ memcpy_r(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u)); ++ for (i = 0; i < sizeof(ntuple->m_u); i++) ++ ntuple->m_u.hdata[i] ^= 0xFF; ++ ntuple->flow_type &= ~FLOW_EXT; ++} ++ ++static int do_srxntuple(struct cmd_context *ctx, struct ethtool_rx_flow_spec *fsp) ++{ ++ struct ethtool_rx_ntuple ntuplecmd; ++ struct ethtool_value eval; ++ int ret = 0; ++ ++ flow_spec_to_ntuple(fsp, &ntuplecmd.fs); ++ ++ eval.cmd = ETHTOOL_GFLAGS; ++ ret = send_ethtool_ioctl(ctx, &eval); ++ if (ret || !(eval.data & ETH_FLAG_NTUPLE)) ++ return -1; ++ ++ ntuplecmd.cmd = ETHTOOL_SRXNTUPLE; ++ ret = send_ethtool_ioctl(ctx, &ntuplecmd); ++ if (ret) ++ vecls_debug("Cannot add new rule via N-tuple, ret:%d\n", ret); ++ ++ return ret; ++} ++ ++static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del) ++{ ++ struct ethtool_rx_flow_spec *fsp, rx_rule_fs; ++ u32 rss_context = 0; ++ int ret; ++ ++ vecls_debug("is_del:%d netdev:%s, dip4:%pI4, dport:%d, action:%d, ruleid:%u, del_ruleid:%u\n", ++ is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, ctx->ruleid, ++ ctx->del_ruleid); ++ ++ if (is_del) ++ return rxclass_rule_del(ctx, ctx->del_ruleid); ++ ++ ctx->ret_loc = -1; ++ ++ fsp = &rx_rule_fs; ++ memset(fsp, 0, sizeof(*fsp)); ++ fsp->flow_type = TCP_V4_FLOW; ++ fsp->location = RX_CLS_LOC_ANY; ++ fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4; ++ fsp->h_u.tcp_ip4_spec.pdst = ctx->dport; ++ if (ctx->dip4) ++ fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL; ++ fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL; ++ if (ctx->ruleid) ++ fsp->location = ctx->ruleid; ++ fsp->ring_cookie = ctx->action; ++ ++ ret = do_srxntuple(ctx, &rx_rule_fs); ++ if (!ret) ++ return 0; ++ ++ ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context); ++ if (!ret) ++ ctx->ret_loc = rx_rule_fs.location; ++ return ret; ++} ++ ++static void cfg_work(struct work_struct *work) ++{ ++ struct cfg_param *ctx_p = container_of(work, struct cfg_param, work); ++ struct vecls_netdev_info *vecls_dev; ++ struct vecls_sk_rule *rule; ++ int devid, rxq_id, err; ++ ++ mutex_lock(&vecls_sk_rules.mutex); ++ for_each_vecls_netdev(devid, vecls_dev) { ++ strncpy(ctx_p->ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); ++ if (ctx_p->is_del == false) { ++ if (reuseport_check(devid, ctx_p->ctx.dip4, ctx_p->ctx.dport)) { ++ vecls_debug("dip4:%pI4, dport:%d reuse!\n", &(ctx_p->ctx.dip4), ntohs(ctx_p->ctx.dport)); ++ continue; ++ } ++ ++ // Calculate the bound queue ++ rxq_id = alloc_rxq_id(ctx_p->nid, ctx_p->cpu, devid); ++ if (rxq_id < 0) ++ continue; ++ ++ // Config Ntuple rule to dev ++ ctx_p->ctx.action = (u16)rxq_id; ++ err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); ++ if (err) { ++ vecls_debug("Add sk:%p, dev_id:%d, rxq:%d, err:%d\n", ctx_p->sk, devid, rxq_id, err); ++ free_rxq_id(ctx_p->nid, devid, rxq_id); ++ continue; ++ } ++ add_sk_rule(devid, ctx_p->ctx.dip4, ctx_p->ctx.dport, ++ ctx_p->sk, ctx_p->ctx.action, ctx_p->ctx.ret_loc, ctx_p->nid); ++ } else { ++ rule = get_rule_from_sk(devid, ctx_p->sk); ++ if (!rule) { ++ vecls_debug("rule not found! 
sk:%p, devid:%d, dip4:%pI4, dport:%d\n", ++ ctx_p->sk, devid, &ctx_p->ctx.dip4, ntohs(ctx_p->ctx.dport)); ++ continue; ++ } ++ ++ // Config Ntuple rule to dev ++ ctx_p->ctx.del_ruleid = rule->ruleid; ++ err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); ++ // Free the bound queue ++ free_rxq_id(rule->nid, devid, rule->action); ++ // Delete sk rule ++ del_sk_rule(rule); ++ } ++ } ++ mutex_unlock(&vecls_sk_rules.mutex); ++ kfree(ctx_p); ++ atomic_dec(&vecls_worker_count); ++} ++ ++static bool has_sock_rule(struct sock *sk) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ struct vecls_sk_rule *rule; ++ int devid; ++ ++ for_each_vecls_netdev(devid, vecls_dev) { ++ rule = get_rule_from_sk(devid, sk); ++ if (rule) ++ return true; ++ } ++ return false; ++} ++ ++static void del_ntuple_rule(struct sock *sk) ++{ ++ struct cfg_param *ctx_p; ++ ++ if (!has_sock_rule(sk)) ++ return; ++ ++ ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); ++ if (!ctx_p) ++ return; ++ get_sk_rule_addr(sk, &(ctx_p->ctx.dip4), &(ctx_p->ctx.dport)); ++ ++ ctx_p->is_del = true; ++ ctx_p->sk = sk; ++ INIT_WORK(&ctx_p->work, cfg_work); ++ queue_work(do_cfg_workqueue, &ctx_p->work); ++ atomic_inc(&vecls_worker_count); ++} ++ ++static void add_ntuple_rule(struct sock *sk) ++{ ++ struct cfg_param *ctx_p; ++ int cpu = raw_smp_processor_id(); ++ int nid = cpu_to_node(cpu); ++ ++ if (check_appname(current->comm)) ++ return; ++ ++ ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); ++ if (!ctx_p) ++ return; ++ get_sk_rule_addr(sk, &(ctx_p->ctx.dip4), &(ctx_p->ctx.dport)); ++ ++ ctx_p->is_del = false; ++ ctx_p->sk = sk; ++ ctx_p->nid = nid; ++ ctx_p->cpu = cpu; ++ INIT_WORK(&ctx_p->work, cfg_work); ++ queue_work(do_cfg_workqueue, &ctx_p->work); ++ atomic_inc(&vecls_worker_count); ++} ++ ++static void ethtool_cfg_rxcls(struct sock *sk, int is_del) ++{ ++ if (sk->sk_state != TCP_LISTEN) ++ return; ++ ++ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) ++ return; ++ ++ vecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, ip:%pI4, port:%d\n", raw_smp_processor_id(), ++ current->comm, sk, is_del, &sk->sk_rcv_saddr, (u16)sk->sk_num); ++ ++ if (is_del) ++ del_ntuple_rule(sk); ++ else ++ add_ntuple_rule(sk); ++} ++ ++static void clean_vecls_sk_rules(void) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ struct cmd_context ctx = { 0 }; ++ struct vecls_sk_rule *rule; ++ struct hlist_head *hlist; ++ struct hlist_node *n; ++ unsigned int i; ++ int err; ++ ++ mutex_lock(&vecls_sk_rules.mutex); ++ for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { ++ hlist = &vecls_sk_rules.hash[i]; ++ ++ hlist_for_each_entry_safe(rule, n, hlist, node) { ++ vecls_dev = get_vecls_netdev_info(rule->devid); ++ if (!vecls_dev) ++ continue; ++ strncpy(ctx.netdev, vecls_dev->dev_name, IFNAMSIZ); ++ ctx.del_ruleid = rule->ruleid; ++ err = cfg_ethtool_rule(&ctx, true); ++ vecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk, ++ rule->devid, rule->action, rule->ruleid, err); ++ ++ hlist_del(&rule->node); ++ vecls_debug("clean rule=%p\n", rule); ++ kfree(rule); ++ } ++ } ++ mutex_unlock(&vecls_sk_rules.mutex); ++} ++ ++int venetcls_ntuple_status(struct seq_file *seq, void *v) ++{ ++ struct vecls_netdev_info *vecls_dev; ++ struct vecls_sk_rule *rule; ++ struct hlist_head *hlist; ++ struct hlist_node *n; ++ unsigned int i; ++ ++ seq_printf(seq, "%-16s %-12s %-8s %-6s %-6s %-6s\n", ++ "Interface", "dstIP", "dstPort", "rxq", "ruleId", "NumaID"); ++ mutex_lock(&vecls_sk_rules.mutex); ++ for (i = 0; i < VECLS_SK_RULE_HASHSIZE; i++) { ++ hlist = &vecls_sk_rules.hash[i]; ++ 
hlist_for_each_entry_safe(rule, n, hlist, node) { ++ vecls_dev = get_vecls_netdev_info(rule->devid); ++ if (!vecls_dev) ++ continue; ++ seq_printf(seq, "%-16s %-12pI4 %-8d %-6d %-6d %-6d\n", ++ vecls_dev->dev_name, &rule->dip4, ntohs(rule->dport), ++ rule->action, rule->ruleid, rule->nid); ++ } ++ } ++ mutex_unlock(&vecls_sk_rules.mutex); ++ ++ return 0; ++} ++ ++static const struct vecls_hook_ops vecls_ntuple_ops = { ++ .vecls_flow_update = NULL, ++ .vecls_set_cpu = NULL, ++ .vecls_timeout = NULL, ++ .vecls_cfg_rxcls = ethtool_cfg_rxcls, ++}; ++ ++int vecls_ntuple_res_init(void) ++{ ++ do_cfg_workqueue = alloc_ordered_workqueue("vecls_cfg", 0); ++ if (!do_cfg_workqueue) { ++ vecls_debug("alloc_ordered_workqueue fails\n"); ++ return -ENOMEM; ++ } ++ ++ init_vecls_sk_rules(); ++ RCU_INIT_POINTER(vecls_ops, &vecls_ntuple_ops); ++ synchronize_rcu(); ++ return 0; ++} ++ ++void vecls_ntuple_res_clean(void) ++{ ++ RCU_INIT_POINTER(vecls_ops, NULL); ++ synchronize_rcu(); ++ ++ while (atomic_read(&vecls_worker_count) != 0) ++ mdelay(1); ++ destroy_workqueue(do_cfg_workqueue); ++ clean_vecls_sk_rules(); ++} +-- +2.20.1 + diff --git a/0006-block-support-to-dispatch-bio-asynchronously.patch b/0006-block-support-to-dispatch-bio-asynchronously.patch new file mode 100644 index 0000000000000000000000000000000000000000..4bb3791c8d066080f32ab562fa7603258ea286f2 --- /dev/null +++ b/0006-block-support-to-dispatch-bio-asynchronously.patch @@ -0,0 +1,513 @@ +From cafa19382531ab95d76e369712b0f7457d383597 Mon Sep 17 00:00:00 2001 +From: Li Nan +Date: Fri, 14 Jun 2024 11:44:06 +0800 +Subject: [PATCH] block: support to dispatch bio asynchronously + +In certain environments, specific CPUs handle a large number of tasks +and become bottlenecks, affecting overall system performance. This +commit introduces a new feature that enables asynchronous I/O dispatch +to designated CPUs, thereby relieving the pressure on the busy CPUs. + +Signed-off-by: Li Nan +Signed-off-by: Zizhi Wo +--- + block/Kconfig | 11 ++ + block/blk-core.c | 242 +++++++++++++++++++++++++++++++++++++- + block/blk-mq-debugfs.c | 1 + + block/blk-sysfs.c | 60 ++++++++++ + block/blk.h | 8 ++ + config.aarch64 | 1 + + config.x86_64 | 1 + + include/linux/blk_types.h | 1 + + include/linux/blkdev.h | 7 ++ + 9 files changed, 331 insertions(+), 1 deletion(-) + +diff --git a/block/Kconfig b/block/Kconfig +index c6ce41a5e..665a09a0a 100644 +--- a/block/Kconfig ++++ b/block/Kconfig +@@ -190,6 +190,17 @@ config BLK_INLINE_ENCRYPTION_FALLBACK + by falling back to the kernel crypto API when inline + encryption hardware is not present. + ++config BLK_BIO_DISPATCH_ASYNC ++ bool "Dispatch bios asynchronously on specific cpus" ++ default n ++ help ++ In certain environments, specific CPUs handle a large number of ++ tasks and become bottlenecks, affecting overall system ++ performance. This commit introduces a new feature that enables ++ asynchronous I/O dispatch to designated CPUs, thereby relieving ++ the pressure on the busy CPUs. ++ If unsure, say N. 
++ + source "block/partitions/Kconfig" + + config BLOCK_COMPAT +diff --git a/block/blk-core.c b/block/blk-core.c +index 46a7049b8..3ce2baf7e 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -74,6 +74,236 @@ struct kmem_cache *blk_requestq_cachep; + */ + static struct workqueue_struct *kblockd_workqueue; + ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++ ++#define BIO_DISPATCH_MAX_LOOP 16 ++ ++struct async_bio { ++ struct bio_list list; ++ spinlock_t lock; ++} ____cacheline_aligned_in_smp; ++ ++struct bio_dispatch_async_ctl { ++ /* ++ * Vector size is nr_cpu_ids, list stores bio dispatched from other cpu, ++ * such bio will be dispatched asynchronously to the cpu this structure ++ * is serviced. ++ */ ++ struct async_bio *bios; ++ /* kthread to handle bio dispatched from other cpu. */ ++ struct task_struct *thread; ++ wait_queue_head_t wait; ++}; ++ ++static struct bio_dispatch_async_ctl __percpu *bio_dispatch_async_ctl; ++ ++static int blk_alloc_queue_dispatch_async(struct request_queue *q) ++{ ++ int cpu; ++ ++ /* use the same function and parameters as alloc_cpumask_var() */ ++ q->dispatch_async_cpus = kmalloc_node(cpumask_size(), ++ GFP_KERNEL, q->node); ++ if (!q->dispatch_async_cpus) ++ return -ENOMEM; ++ ++ q->last_dispatch_cpu = alloc_percpu(int); ++ if (!q->last_dispatch_cpu) { ++ kfree(q->dispatch_async_cpus); ++ q->dispatch_async_cpus = NULL; ++ return -ENOMEM; ++ } ++ ++ cpumask_setall(q->dispatch_async_cpus); ++ for_each_possible_cpu(cpu) ++ *per_cpu_ptr(q->last_dispatch_cpu, cpu) = cpu; ++ ++ return 0; ++} ++ ++void blk_free_queue_dispatch_async(struct request_queue *q) ++{ ++ kfree(q->dispatch_async_cpus); ++ q->dispatch_async_cpus = NULL; ++ free_percpu(q->last_dispatch_cpu); ++ q->last_dispatch_cpu = NULL; ++} ++ ++static int get_dispatch_cpu(struct request_queue *q) ++{ ++ int cpu = cpumask_next(this_cpu_read(*q->last_dispatch_cpu), ++ q->dispatch_async_cpus); ++ ++ if (cpu >= nr_cpu_ids) ++ cpu = cpumask_first(q->dispatch_async_cpus); ++ ++ return cpu; ++} ++ ++static bool __submit_bio_noacct_async(struct bio *bio) ++{ ++ struct request_queue *q = bio->bi_bdev->bd_disk->queue; ++ int current_cpu = smp_processor_id(); ++ int dispatch_cpu = get_dispatch_cpu(q); ++ struct bio_dispatch_async_ctl *ctl; ++ ++ if (dispatch_cpu >= nr_cpu_ids) ++ return false; ++ ++ this_cpu_write(*q->last_dispatch_cpu, dispatch_cpu); ++ ++ ctl = per_cpu_ptr(bio_dispatch_async_ctl, dispatch_cpu); ++ spin_lock_irq(&ctl->bios[current_cpu].lock); ++ bio_list_add(&ctl->bios[current_cpu].list, bio); ++ spin_unlock_irq(&ctl->bios[current_cpu].lock); ++ ++ if (wq_has_sleeper(&ctl->wait)) ++ wake_up(&ctl->wait); ++ ++ return true; ++} ++ ++static bool submit_bio_noacct_async(struct bio *bio) ++{ ++ struct request_queue *q; ++ ++ if (bio_flagged(bio, BIO_ASYNC)) ++ return false; ++ ++ bio_set_flag(bio, BIO_ASYNC); ++ /* ++ * Don't dispatch bio asynchronously in following cases: ++ * ++ * - QUEUE_FLAG_DISPATCH_ASYNC is not set; ++ * - io polling is enabled; ++ * - current cpu is the target cpu; ++ * - bio is flagged no wait; ++ */ ++ q = bio->bi_bdev->bd_disk->queue; ++ if (!test_bit(QUEUE_FLAG_DISPATCH_ASYNC, &q->queue_flags) || ++ test_bit(QUEUE_FLAG_POLL, &q->queue_flags) || ++ cpumask_test_cpu(smp_processor_id(), q->dispatch_async_cpus) || ++ bio->bi_opf & REQ_NOWAIT) ++ return false; ++ ++ return __submit_bio_noacct_async(bio); ++} ++ ++static bool collect_bio(struct bio_dispatch_async_ctl *ctl, ++ struct bio_list *list) ++{ ++ bool has_bio = false; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ 
struct async_bio *abio = &ctl->bios[cpu]; ++ ++ if (bio_list_empty(&abio->list)) ++ continue; ++ ++ has_bio = true; ++ ++ spin_lock_irq(&abio->lock); ++ bio_list_merge(list, &abio->list); ++ bio_list_init(&abio->list); ++ spin_unlock_irq(&abio->lock); ++ } ++ ++ return has_bio; ++} ++ ++static int bio_dispatch_work(void *data) ++{ ++ int loop_count = 0; ++ struct bio_list bio_list_on_stack; ++ struct blk_plug plug; ++ struct bio_dispatch_async_ctl *ctl; ++ ++ bio_list_init(&bio_list_on_stack); ++ ctl = this_cpu_ptr(bio_dispatch_async_ctl); ++ ++ for (;; loop_count++) { ++ struct bio *bio; ++ bool has_bio = collect_bio(ctl, &bio_list_on_stack); ++ ++ if (!has_bio) { ++ DEFINE_WAIT(wait); ++ ++ for (;;) { ++ prepare_to_wait(&ctl->wait, &wait, ++ TASK_INTERRUPTIBLE); ++ has_bio = collect_bio(ctl, &bio_list_on_stack); ++ if (has_bio) ++ break; ++ schedule(); ++ loop_count = 0; ++ } ++ finish_wait(&ctl->wait, &wait); ++ } ++ ++ blk_start_plug(&plug); ++ while ((bio = bio_list_pop(&bio_list_on_stack))) ++ submit_bio_noacct(bio); ++ blk_finish_plug(&plug); ++ ++ /* prevent soft lockup. */ ++ if (loop_count >= BIO_DISPATCH_MAX_LOOP) { ++ loop_count = 0; ++ cond_resched(); ++ } ++ } ++ ++ return 0; ++} ++ ++static void init_blk_queue_async_dispatch(void) ++{ ++ int cpu; ++ ++ bio_dispatch_async_ctl = alloc_percpu(struct bio_dispatch_async_ctl); ++ if (!bio_dispatch_async_ctl) ++ panic("Failed to alloc bio_dispatch_async_ctl\n"); ++ ++ for_each_possible_cpu(cpu) { ++ int i; ++ struct bio_dispatch_async_ctl *ctl = ++ per_cpu_ptr(bio_dispatch_async_ctl, cpu); ++ ++ init_waitqueue_head(&ctl->wait); ++ ctl->bios = kmalloc_array(nr_cpu_ids, sizeof(struct async_bio), ++ GFP_KERNEL); ++ if (!ctl->bios) ++ panic("Failed to alloc async bio array\n"); ++ for (i = 0; i < nr_cpu_ids; ++i) { ++ bio_list_init(&ctl->bios[i].list); ++ spin_lock_init(&ctl->bios[i].lock); ++ } ++ ++ ctl->thread = ++ kthread_create_on_cpu(bio_dispatch_work, NULL, cpu, ++ "bio_dispatch_work_%u"); ++ if (IS_ERR_OR_NULL(ctl->thread)) ++ panic("Failed to create bio dispatch thread\n"); ++ ++ wake_up_process(ctl->thread); ++ } ++} ++#else ++static int blk_alloc_queue_dispatch_async(struct request_queue *q) ++{ ++ return 0; ++} ++ ++static bool submit_bio_noacct_async(struct bio *bio) ++{ ++ return false; ++} ++ ++static void init_blk_queue_async_dispatch(void) ++{ ++} ++#endif ++ + /** + * blk_queue_flag_set - atomically set a queue flag + * @flag: flag to be set +@@ -499,9 +729,12 @@ struct request_queue *blk_alloc_queue(int node_id) + + q->last_merge = NULL; + ++ if (blk_alloc_queue_dispatch_async(q)) ++ goto fail_q; ++ + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); + if (q->id < 0) +- goto fail_q; ++ goto fail_dispatch_async; + + ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); + if (ret) +@@ -553,6 +786,8 @@ struct request_queue *blk_alloc_queue(int node_id) + bioset_exit(&q->bio_split); + fail_id: + ida_simple_remove(&blk_queue_ida, q->id); ++fail_dispatch_async: ++ blk_free_queue_dispatch_async(q); + fail_q: + kmem_cache_free(blk_requestq_cachep, q); + return NULL; +@@ -963,6 +1198,9 @@ static void __submit_bio_noacct_mq(struct bio *bio) + */ + void submit_bio_noacct(struct bio *bio) + { ++ if (submit_bio_noacct_async(bio)) ++ return; ++ + /* + * We only want one ->submit_bio to be active at a time, else stack + * usage with stacked devices could be a problem. 
Use current->bio_list +@@ -1688,5 +1926,7 @@ int __init blk_dev_init(void) + + blk_debugfs_root = debugfs_create_dir("block", NULL); + ++ init_blk_queue_async_dispatch(); ++ + return 0; + } +diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c +index 4866d4f81..67957ce49 100644 +--- a/block/blk-mq-debugfs.c ++++ b/block/blk-mq-debugfs.c +@@ -131,6 +131,7 @@ static const char *const blk_queue_flag_name[] = { + QUEUE_FLAG_NAME(RQ_ALLOC_TIME), + QUEUE_FLAG_NAME(HCTX_ACTIVE), + QUEUE_FLAG_NAME(NOWAIT), ++ QUEUE_FLAG_NAME(DISPATCH_ASYNC), + }; + #undef QUEUE_FLAG_NAME + +diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c +index 725530f13..e0ef894b9 100644 +--- a/block/blk-sysfs.c ++++ b/block/blk-sysfs.c +@@ -304,6 +304,9 @@ QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); + QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); + QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); + QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++QUEUE_SYSFS_BIT_FNS(dispatch_async, DISPATCH_ASYNC, 0); ++#endif + #undef QUEUE_SYSFS_BIT_FNS + + static ssize_t queue_zoned_show(struct request_queue *q, char *page) +@@ -625,6 +628,57 @@ QUEUE_RW_ENTRY(queue_iostats, "iostats"); + QUEUE_RW_ENTRY(queue_random, "add_random"); + QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); + ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++ ++static ssize_t queue_dispatch_async_cpus_show(struct request_queue *q, ++ char *page) ++{ ++ return sprintf(page, "%*pb\n", nr_cpu_ids, ++ cpumask_bits(q->dispatch_async_cpus)); ++} ++ ++static ssize_t queue_dispatch_async_cpus_store(struct request_queue *q, ++ const char *page, size_t count) ++{ ++ cpumask_var_t cpumask; ++ ssize_t ret; ++ ++ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = bitmap_parse(page, count, cpumask_bits(cpumask), ++ nr_cpumask_bits); ++ if (ret < 0) ++ goto out; ++ ++ if (cpumask_empty(cpumask) || ++ !cpumask_subset(cpumask, cpu_online_mask)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ blk_mq_freeze_queue(q); ++ blk_mq_quiesce_queue(q); ++ ++ cpumask_copy(q->dispatch_async_cpus, cpumask); ++ ++ blk_mq_unquiesce_queue(q); ++ blk_mq_unfreeze_queue(q); ++ ret = count; ++out: ++ free_cpumask_var(cpumask); ++ return ret; ++} ++ ++static struct queue_sysfs_entry queue_dispatch_async_cpus_entry = { ++ .attr = {.name = "dispatch_async_cpus", .mode = 0644 }, ++ .show = queue_dispatch_async_cpus_show, ++ .store = queue_dispatch_async_cpus_store, ++}; ++ ++QUEUE_RW_ENTRY(queue_dispatch_async, "dispatch_async"); ++#endif ++ + static struct attribute *queue_attrs[] = { + &queue_requests_entry.attr, + &queue_ra_entry.attr, +@@ -666,6 +720,10 @@ static struct attribute *queue_attrs[] = { + &queue_wb_lat_entry.attr, + &queue_poll_delay_entry.attr, + &queue_io_timeout_entry.attr, ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++ &queue_dispatch_async_cpus_entry.attr, ++ &queue_dispatch_async_entry.attr, ++#endif + #ifdef CONFIG_BLK_DEV_THROTTLING_LOW + &blk_throtl_sample_time_entry.attr, + #endif +@@ -773,6 +831,8 @@ static void blk_release_queue(struct kobject *kobj) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); + ++ blk_free_queue_dispatch_async(q); ++ + blk_free_queue_stats(q->stats); + + blk_queue_free_zone_bitmaps(q); +diff --git a/block/blk.h b/block/blk.h +index e80350327..6a7bad9f8 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -454,4 +454,12 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); + + extern const struct address_space_operations def_blk_aops; + ++#ifdef 
CONFIG_BLK_BIO_DISPATCH_ASYNC ++void blk_free_queue_dispatch_async(struct request_queue *q); ++#else ++static inline void blk_free_queue_dispatch_async(struct request_queue *q) ++{ ++} ++#endif ++ + #endif /* BLK_INTERNAL_H */ +diff --git a/config.aarch64 b/config.aarch64 +index 998d7fb5b..234d15966 100644 +--- a/config.aarch64 ++++ b/config.aarch64 +@@ -818,6 +818,7 @@ CONFIG_BLK_DEBUG_FS=y + CONFIG_BLK_DEBUG_FS_ZONED=y + CONFIG_BLK_SED_OPAL=y + # CONFIG_BLK_INLINE_ENCRYPTION is not set ++CONFIG_BLK_BIO_DISPATCH_ASYNC=y + + # + # Partition Types +diff --git a/config.x86_64 b/config.x86_64 +index 8d9500329..e5908b4a5 100644 +--- a/config.x86_64 ++++ b/config.x86_64 +@@ -861,6 +861,7 @@ CONFIG_BLK_DEBUG_FS=y + CONFIG_BLK_DEBUG_FS_ZONED=y + CONFIG_BLK_SED_OPAL=y + # CONFIG_BLK_INLINE_ENCRYPTION is not set ++CONFIG_BLK_BIO_DISPATCH_ASYNC=y + + # + # Partition Types +diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h +index fa78cbf26..ba37b97cb 100644 +--- a/include/linux/blk_types.h ++++ b/include/linux/blk_types.h +@@ -303,6 +303,7 @@ enum { + BIO_REMAPPED, + BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ + BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */ ++ BIO_ASYNC, /* has been dispatched asynchronously */ + BIO_FLAG_LAST + }; + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index dfd4b9361..18dc3f950 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -337,6 +337,12 @@ struct request_queue { + + bool mq_sysfs_init_done; + ++#ifdef CONFIG_BLK_BIO_DISPATCH_ASYNC ++ /* used when QUEUE_FLAG_DISPATCH_ASYNC is set */ ++ struct cpumask *dispatch_async_cpus; ++ int __percpu *last_dispatch_cpu; ++#endif ++ + #define BLK_MAX_WRITE_HINTS 5 + u64 write_hints[BLK_MAX_WRITE_HINTS]; + }; +@@ -372,6 +378,7 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ + #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ ++#define QUEUE_FLAG_DISPATCH_ASYNC 30 /* support to dispatch bio asynchronously */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_SAME_COMP) | \ +-- +2.20.1 + diff --git a/0007-sched-fair-Prefer-physical-cores-when-migrating-task.patch b/0007-sched-fair-Prefer-physical-cores-when-migrating-task.patch new file mode 100644 index 0000000000000000000000000000000000000000..946313c66f1291268e6f1c70c186a8850803badd --- /dev/null +++ b/0007-sched-fair-Prefer-physical-cores-when-migrating-task.patch @@ -0,0 +1,218 @@ +From ebc4f8fa4841f245d81f83832532e2206af4f8fc Mon Sep 17 00:00:00 2001 +From: Cheng Yu +Date: Mon, 12 Aug 2024 20:40:25 +0800 +Subject: [PATCH] sched/fair: Prefer physical cores when migrating tasks + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IAJEHU +CVE: NA + +-------------------------------- + +When cpu hyperthreading is enabled, one physical core can virtualize +multiple logical cpus. Assume that physical core0 virtualizes two +logical cpus, cpu0 and cpu1. Only when the load of cpu0 exceeds the set +ratio to the capacity of cpu0, the task will be migrated to the cpu1, +otherwise the task will not be migrated and the cpu0 will still be used. 
+ +Signed-off-by: Cheng Yu +Signed-off-by: Liu Jian +--- + arch/arm64/Kconfig | 1 + + config.aarch64 | 1 + + config.aarch64-64k | 1 + + include/linux/sched/sysctl.h | 4 ++++ + init/Kconfig | 18 ++++++++++++++++++ + kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 4 ++++ + kernel/sysctl.c | 12 ++++++++++++ + 8 files changed, 75 insertions(+) + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index eef487d36..31eaa7775 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -90,6 +90,7 @@ config ARM64 + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_SCHED_KEEP_ON_CORE + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +diff --git a/config.aarch64 b/config.aarch64 +index 234d15966..fcff88edb 100644 +--- a/config.aarch64 ++++ b/config.aarch64 +@@ -170,6 +170,7 @@ CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y + CONFIG_NET_NS=y ++CONFIG_SCHED_KEEP_ON_CORE=y + CONFIG_CHECKPOINT_RESTORE=y + CONFIG_SCHED_AUTOGROUP=y + # CONFIG_SYSFS_DEPRECATED is not set +diff --git a/config.aarch64-64k b/config.aarch64-64k +index 5cce0103e..41daa7820 100644 +--- a/config.aarch64-64k ++++ b/config.aarch64-64k +@@ -170,6 +170,7 @@ CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y + CONFIG_NET_NS=y ++CONFIG_SCHED_KEEP_ON_CORE=y + CONFIG_CHECKPOINT_RESTORE=y + CONFIG_SCHED_AUTOGROUP=y + # CONFIG_SYSFS_DEPRECATED is not set +diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h +index 304f43117..da02869a4 100644 +--- a/include/linux/sched/sysctl.h ++++ b/include/linux/sched/sysctl.h +@@ -28,6 +28,10 @@ enum { sysctl_hung_task_timeout_secs = 0 }; + + extern unsigned int sysctl_sched_child_runs_first; + ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++extern int sysctl_sched_util_ratio; ++#endif ++ + enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, +diff --git a/init/Kconfig b/init/Kconfig +index 52167947f..896773c70 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1282,6 +1282,24 @@ config NET_NS + + endif # NAMESPACES + ++# For architectures that want to enable the support for SCHED_KEEP_ON_CORE ++# ++config ARCH_SUPPORTS_SCHED_KEEP_ON_CORE ++ bool ++ ++config SCHED_KEEP_ON_CORE ++ bool "Prefer physical cores when migrating tasks" ++ depends on ARCH_SUPPORTS_SCHED_KEEP_ON_CORE ++ depends on SCHED_SMT ++ default n ++ help ++ When cpu hyperthreading is enabled, one physical core can virtualize ++ multiple logical cpus. Assume that physical core0 virtualizes two ++ logical cpus, cpu0 and cpu1. Only when the load of cpu0 exceeds the ++ ratio to the capacity of cpu0, the task will be migrated to the cpu1, ++ otherwise the task will not be migrated and the cpu0 will still be ++ used. 
++ + config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" + select PROC_CHILDREN +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f4db97423..b9e3a63ad 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6487,6 +6487,22 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t + return si_cpu; + } + ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++int sysctl_sched_util_ratio = 100; ++ ++static bool core_has_spare(int cpu) ++{ ++ int core_id = cpumask_first(cpu_smt_mask(cpu)); ++ unsigned long util = cpu_util(core_id); ++ unsigned long capacity = capacity_of(core_id); ++ ++ if (sysctl_sched_util_ratio == 100) ++ return true; ++ ++ return util * 100 < capacity * sysctl_sched_util_ratio; ++} ++#endif ++ + #else /* CONFIG_SCHED_SMT */ + + static inline void set_idle_cores(int cpu, int val) +@@ -7365,6 +7381,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + /* Fast path */ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + } ++ ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++ if (sched_feat(KEEP_ON_CORE) && ++ static_branch_likely(&sched_smt_present)) { ++ if (core_has_spare(new_cpu)) ++ new_cpu = cpumask_first(cpu_smt_mask((new_cpu))); ++ } ++#endif ++ + rcu_read_unlock(); + + return new_cpu; +@@ -8187,6 +8212,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + + lockdep_assert_rq_held(env->src_rq); + ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++ if (sched_feat(KEEP_ON_CORE) && ++ static_branch_likely(&sched_smt_present)) { ++ if (core_has_spare(env->dst_cpu) && ++ cpumask_first(cpu_smt_mask((env->dst_cpu))) != env->dst_cpu) ++ return 0; ++ } ++#endif ++ + /* + * We do not migrate tasks that are: + * 1) throttled_lb_pair, or +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 8c1d34adc..9d0d29a12 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -58,6 +58,10 @@ SCHED_FEAT(TTWU_QUEUE, true) + SCHED_FEAT(SIS_PROP, false) + SCHED_FEAT(SIS_UTIL, true) + ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++SCHED_FEAT(KEEP_ON_CORE, false) ++#endif ++ + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 32e59f230..5299337f8 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -118,6 +118,7 @@ static int sixty = 60; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; ++static int one_hundred = 100; + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -2746,6 +2747,17 @@ static struct ctl_table kern_table[] = { + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, ++#ifdef CONFIG_SCHED_KEEP_ON_CORE ++ { ++ .procname = "sched_util_ratio", ++ .data = &sysctl_sched_util_ratio, ++ .maxlen = sizeof(sysctl_sched_util_ratio), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &one_hundred, ++ }, ++#endif + { } + }; + +-- +2.20.1 +
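
A minimal userspace sketch of how the new block-layer queue attributes from 0006 could be driven, assuming they surface in the usual /sys/block/<dev>/queue/ directory and using a hypothetical device sda. Per the submit_bio_noacct_async() checks in the patch, bios submitted on CPUs outside dispatch_async_cpus are handed to a per-CPU kthread on a CPU inside the mask, so the mask names the CPUs that are allowed to do the actual dispatch; the store handler rejects an empty mask or one that is not a subset of the online CPUs, and the value is parsed with bitmap_parse(), i.e. as a hex CPU bitmap.

	/*
	 * Illustrative sketch only, not part of the patch series.
	 * Assumptions: the attributes appear under /sys/block/sda/queue/,
	 * and CPUs 2 and 3 (hex bitmap 0xc) are online.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	static int write_attr(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			return -1;
		}
		if (fputs(val, f) == EOF) {
			perror(path);
			fclose(f);
			return -1;
		}
		return fclose(f);
	}

	int main(void)
	{
		/* restrict asynchronous dispatch to CPUs 2 and 3 */
		if (write_attr("/sys/block/sda/queue/dispatch_async_cpus", "c\n"))
			return EXIT_FAILURE;
		/* set the per-queue QUEUE_FLAG_DISPATCH_ASYNC bit */
		if (write_attr("/sys/block/sda/queue/dispatch_async", "1\n"))
			return EXIT_FAILURE;
		return EXIT_SUCCESS;
	}

Even with the flag set, polled queues and REQ_NOWAIT bios still take the normal synchronous path, as the early-return conditions in submit_bio_noacct_async() show.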
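
The scheduler change in 0007 gates placement on a utilisation threshold. Below is a standalone restatement of the rule that core_has_spare() applies, with hypothetical numbers; the in-kernel version reads cpu_util() and capacity_of() of the core's first SMT sibling, takes the ratio from the sched_util_ratio sysctl (0-100, default 100, expected at /proc/sys/kernel/sched_util_ratio), and only runs when the KEEP_ON_CORE scheduler feature is enabled.

	/*
	 * Illustrative sketch only: the threshold check used by
	 * core_has_spare(), detached from the scheduler data structures.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	static bool core_has_spare(unsigned long util, unsigned long capacity,
				   int util_ratio)
	{
		/* ratio 100 means the first sibling is always considered spare */
		if (util_ratio == 100)
			return true;
		return util * 100 < capacity * util_ratio;
	}

	int main(void)
	{
		/* hypothetical numbers: capacity 1024, sched_util_ratio = 60 */
		printf("util 500: %s\n",
		       core_has_spare(500, 1024, 60) ? "keep on core" : "use SMT sibling");
		printf("util 700: %s\n",
		       core_has_spare(700, 1024, 60) ? "keep on core" : "use SMT sibling");
		return 0;
	}

With the default ratio of 100 the first logical CPU of a core is always treated as having spare capacity, so lowering the sysctl is what actually lets load spill over to the second logical CPU.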