From bb73beff82b1533f7c7ccaf064b93be7646e6afc Mon Sep 17 00:00:00 2001
From: shaodenghui
Date: Wed, 28 Feb 2024 21:37:03 +0800
Subject: [PATCH 1/3] iscsi: add member for NUMA-aware ordered workqueue

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZNO
CVE: NA

-------------------------------------------------

Add the intimate_cpu member to struct iscsi_conn and the __WQ_DYNAMIC
workqueue flag, to be used by the NUMA-aware ordered workqueue in the
following patches.

Signed-off-by: Yang Yingliang
Reviewed-by: Xie XiuQi
Signed-off-by: Yang Yingliang
Signed-off-by: BiaoXiang Ye
Reviewed-by: fang yi
Signed-off-by: Zheng Zengkai
Signed-off-by: shaodenghui
---
 include/linux/workqueue.h | 1 +
 include/scsi/libiscsi.h   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 9619098755fb..485c0f5b2518 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -415,6 +415,7 @@ enum {
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
 	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
 	__WQ_ORDERED_EXPLICIT	= 1 << 19, /* internal: alloc_ordered_workqueue() */
+	__WQ_DYNAMIC		= 1 << 25, /* internal: only supports a single ordered work */

 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_UNBOUND_MAX_ACTIVE	= WQ_MAX_ACTIVE,
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index d253b82e973e..ead17d2224a1 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -266,6 +266,7 @@ struct iscsi_conn {
 	/* custom statistics */
 	uint32_t		eh_abort_cnt;
 	uint32_t		fmr_unalign_cnt;
+	int			intimate_cpu;

 	KABI_RESERVE(1)
 	KABI_RESERVE(2)
-- Gitee

From e61bd18f079637b19111fbeef6416ea517471450 Mon Sep 17 00:00:00 2001
From: shaodenghui
Date: Wed, 28 Feb 2024 21:37:04 +0800
Subject: [PATCH 2/3] workqueue: implement NUMA affinity for single thread workqueue

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZNO
CVE: NA

-------------------------------------------------

Currently, a single-thread workqueue has only a single pwq, so all works
are queued on the same worker pool. This is not optimal on NUMA machines:
it makes workers jump around across node boundaries.

This patch adds a new wq flag, __WQ_DYNAMIC. This new kind of single-thread
workqueue creates a separate pwq covering the intersecting CPUs for each
NUMA node that has online CPUs in @attrs->cpumask, instead of mapping all
entries of numa_pwq_tbl[] to the same pwq. After this, we can specify the
@cpu of queue_work_on(), so the work is executed on the same NUMA node as
the specified @cpu.

This kind of wq supports only a single work item; with multiple work items
the execution order cannot be guaranteed.

Signed-off-by: Biaoxiang Ye
Acked-by: Hanjun Guo
Signed-off-by: Yang Yingliang
Reviewed-by: fang yi
Signed-off-by: Zheng Zengkai
Signed-off-by: shaodenghui
---
 kernel/workqueue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1888741f5edd..590703eb6e3e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4375,6 +4375,9 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	/* save the user configured attrs and sanitize it.
 	 */
 	copy_workqueue_attrs(new_attrs, attrs);
+	if (wq->flags & __WQ_DYNAMIC)
+		new_attrs->ordered = false;
+
 	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
 	cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
 	ctx->attrs = new_attrs;

@@ -4591,10 +4594,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 	cpus_read_lock();
 	if (wq->flags & __WQ_ORDERED) {
 		ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
-		/* there should only be single pwq for ordering guarantee */
-		WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
-			      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
-		     "ordering guarantee broken for workqueue %s\n", wq->name);
+		if (!(wq->flags & __WQ_DYNAMIC)) {
+			/* there should only be single pwq for ordering guarantee */
+			WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+				      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+			     "ordering guarantee broken for workqueue %s\n", wq->name);
+		}
 	} else {
 		ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
 	}

@@ -5798,7 +5803,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
 			continue;

 		/* creating multiple pwqs breaks ordering guarantee */
-		if (!list_empty(&wq->pwqs)) {
+		if (!list_empty(&wq->pwqs) && !(wq->flags & __WQ_DYNAMIC)) {
 			if (wq->flags & __WQ_ORDERED_EXPLICIT)
 				continue;
 			wq->flags &= ~__WQ_ORDERED;
-- Gitee

From c3c4b0e5ab98ae4d0aa90d3e1a712cf0863c49c8 Mon Sep 17 00:00:00 2001
From: shaodenghui
Date: Wed, 28 Feb 2024 21:37:05 +0800
Subject: [PATCH 3/3] iscsi: use dynamic single thread workqueue to improve performance

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZNO
CVE: NA

-------------------------------------------------

On aarch64 NUMA machines, the kworkers created by iscsi always jump around
across node boundaries. If a kworker runs on a different node, or even a
different CPU package, from the softirq of the network interface, the
memcpy within iscsi_tcp_segment_recv slows down and iscsi performance
becomes terrible.

In this patch, we record the CPU of the softirq and tell queue_work_on()
to execute iscsi_xmitworker() on the same NUMA node.
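Schematically, the pattern is the following (an illustrative sketch only,
condensed from the diff further below):

	/* softirq side (iscsi_sw_tcp_data_ready): remember the CPU */
	if (!sock_owned_by_user_nocheck(sk))
		conn->intimate_cpu = smp_processor_id();

	/* xmit side (iscsi_conn_queue_xmit): prefer that CPU's NUMA node */
	if (conn->intimate_cpu >= 0 && cpu_possible(conn->intimate_cpu))
		queue_work_on(conn->intimate_cpu, ihost->workq, &conn->xmitwork);
	else
		queue_work(ihost->workq, &conn->xmitwork);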
The performance data is as below:

fio cmd:
fio -filename=/dev/disk/by-id/wwn-0x6883fd3100a2ad260036281700000000 -direct=1 -iodepth=32 -rw=read -bs=64k -size=30G -ioengine=libaio -numjobs=1 -group_reporting -name=mytest -time_based -ramp_time=60 -runtime=60

before patch:
Jobs: 1 (f=1): [R] [52.5% done] [852.3MB/0KB/0KB /s] [13.7K/0/0 iops] [eta 00m:57s]
Jobs: 1 (f=1): [R] [53.3% done] [861.4MB/0KB/0KB /s] [13.8K/0/0 iops] [eta 00m:56s]
Jobs: 1 (f=1): [R] [54.2% done] [868.2MB/0KB/0KB /s] [13.9K/0/0 iops] [eta 00m:55s]

after patch:
Jobs: 1 (f=1): [R] [53.3% done] [1070MB/0KB/0KB /s] [17.2K/0/0 iops] [eta 00m:56s]
Jobs: 1 (f=1): [R] [55.0% done] [1064MB/0KB/0KB /s] [17.3K/0/0 iops] [eta 00m:54s]
Jobs: 1 (f=1): [R] [56.7% done] [1069MB/0KB/0KB /s] [17.1K/0/0 iops] [eta 00m:52s]

cpu info:
Architecture: aarch64
Byte Order: Little Endian
CPU(s): 128
On-line CPU(s) list: 0-127
Thread(s) per core: 1
Core(s) per socket: 64
Socket(s): 2
NUMA node(s): 4
Model: 0
CPU max MHz: 2600.0000
CPU min MHz: 200.0000
BogoMIPS: 200.00
L1d cache: 64K
L1i cache: 64K
L2 cache: 512K
L3 cache: 32768K
NUMA node0 CPU(s): 0-31
NUMA node1 CPU(s): 32-63
NUMA node2 CPU(s): 64-95
NUMA node3 CPU(s): 96-127

Signed-off-by: Biaoxiang Ye
Acked-by: Hanjun Guo
Signed-off-by: Yang Yingliang
Reviewed-by: fang yi
Signed-off-by: Zheng Zengkai
Signed-off-by: shaodenghui
---
 drivers/scsi/iscsi_tcp.c |  9 +++++++++
 drivers/scsi/libiscsi.c  | 17 ++++++++++++-----
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 8e14cea15f98..54d01b9d330d 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -170,6 +170,7 @@ static void iscsi_sw_tcp_data_ready(struct sock *sk)
 	struct iscsi_sw_tcp_conn *tcp_sw_conn;
 	struct iscsi_tcp_conn *tcp_conn;
 	struct iscsi_conn *conn;
+	int current_cpu;

 	trace_sk_data_ready(sk);

@@ -180,6 +181,14 @@ static void iscsi_sw_tcp_data_ready(struct sock *sk)
 		return;
 	}
 	tcp_conn = conn->dd_data;
+
+	/* save the intimate cpu when in softirq */
+	if (!sock_owned_by_user_nocheck(sk)) {
+		current_cpu = smp_processor_id();
+		if (conn->intimate_cpu != current_cpu)
+			conn->intimate_cpu = current_cpu;
+	}
+
 	tcp_sw_conn = tcp_conn->dd_data;

 	if (tcp_sw_conn->queue_recv)
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 0fda8905eabd..2d585f80e122 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -89,9 +89,15 @@ inline void iscsi_conn_queue_xmit(struct iscsi_conn *conn)
 {
 	struct Scsi_Host *shost = conn->session->host;
 	struct iscsi_host *ihost = shost_priv(shost);
+	int intimate_cpu = conn->intimate_cpu;

-	if (ihost->workq)
-		queue_work(ihost->workq, &conn->xmitwork);
+	if (ihost->workq) {
+		/* we expect it to be executed on the same NUMA node as the intimate cpu */
+		if ((intimate_cpu >= 0) && cpu_possible(intimate_cpu))
+			queue_work_on(intimate_cpu, ihost->workq, &conn->xmitwork);
+		else
+			queue_work(ihost->workq, &conn->xmitwork);
+	}
 }
 EXPORT_SYMBOL_GPL(iscsi_conn_queue_xmit);

@@ -2907,9 +2913,9 @@ struct Scsi_Host *iscsi_host_alloc(const struct scsi_host_template *sht,
 	ihost = shost_priv(shost);

 	if (xmit_can_sleep) {
-		ihost->workq = alloc_workqueue("iscsi_q_%d",
-			WQ_SYSFS | __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_UNBOUND,
-			1, shost->host_no);
+		/* this kind of workqueue only supports a single work item */
+		ihost->workq = alloc_ordered_workqueue("iscsi_q_%d", __WQ_LEGACY | WQ_MEM_RECLAIM |
+						       __WQ_DYNAMIC, shost->host_no);
 		if (!ihost->workq)
 			goto free_host;
 	}

@@ -3190,6 +3196,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session,
int dd_size, conn->c_stage = ISCSI_CONN_INITIAL_STAGE; conn->id = conn_idx; conn->exp_statsn = 0; + conn->intimate_cpu = -1; timer_setup(&conn->transport_timer, iscsi_check_transport_timeouts, 0); -- Gitee
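
For reference, a minimal driver-side sketch of how the new flag is meant to be
used (illustrative only, not part of the series; the module and the names
my_dev, my_note_cpu, my_kick and my_q are made up, and it assumes a kernel with
these three patches applied):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

struct my_dev {
	struct work_struct work;
	int intimate_cpu;		/* CPU seen on the hot path, -1 if unknown */
};

static struct workqueue_struct *my_wq;
static struct my_dev my_dev = { .intimate_cpu = -1 };

static void my_work_fn(struct work_struct *work)
{
	/* raw_ variant: work runs in preemptible context in this sketch */
	pr_info("my_q work running on CPU %d\n", raw_smp_processor_id());
}

/* hot path: remember the CPU the data arrived on (the real driver does this in softirq) */
static void my_note_cpu(void)
{
	my_dev.intimate_cpu = raw_smp_processor_id();
}

/* queue the single work item near the remembered CPU's NUMA node */
static void my_kick(void)
{
	int cpu = my_dev.intimate_cpu;

	if (cpu >= 0 && cpu_possible(cpu))
		queue_work_on(cpu, my_wq, &my_dev.work);
	else
		queue_work(my_wq, &my_dev.work);
}

static int __init my_init(void)
{
	INIT_WORK(&my_dev.work, my_work_fn);
	/* ordered wq; with __WQ_DYNAMIC it keeps one pwq per NUMA node */
	my_wq = alloc_ordered_workqueue("my_q", WQ_MEM_RECLAIM | __WQ_DYNAMIC);
	if (!my_wq)
		return -ENOMEM;

	/* pretend the hot path ran here, then queue the work node-locally */
	my_note_cpu();
	my_kick();
	return 0;
}

static void __exit my_exit(void)
{
	destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

Because the workqueue stays ordered per pwq but now has one pwq per NUMA node,
only a single outstanding work item should ever be queued on it; this matches
how libiscsi uses iscsi_q_%d with exactly one xmitwork per connection.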