From 7400907c1827b6eb010066efb6798ac354ad0281 Mon Sep 17 00:00:00 2001 From: Shao Denghui Date: Wed, 6 Mar 2024 16:30:56 +0800 Subject: [PATCH 1/3] workqueue: add member for NUMA aware order workqueue and implement NUMA affinity for single thread workqueue euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I94XYA CVE: NA ------------------------------------------------- Currently, a single-thread workqueue has only a single pwq, so all works are queued to the same worker pool. This is not optimal on NUMA machines and causes workers to jump around across nodes. This patch adds a new wq flag, __WQ_DYNAMIC. This new kind of single-thread workqueue creates a separate pwq covering the intersecting CPUs for each NUMA node which has online CPUs in @attrs->cpumask, instead of mapping all entries of numa_pwq_tbl[] to the same pwq. After this, we can specify the @cpu of queue_work_on so that the work is executed on the same NUMA node as the specified @cpu. This kind of wq only supports a single work item; with multiple works, the ordering of the works cannot be guaranteed. 
Signed-off-by: Biaoxiang Ye Signed-off-by: shaodenghui --- Kconfig | 2 ++ include/linux/workqueue.h | 1 + kernel/workqueue.c | 21 +++++++++++++++++++-- lib/Kconfig.openeuler | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 lib/Kconfig.openeuler diff --git a/Kconfig b/Kconfig index 745bc773f567..4a96e16e5f31 100644 --- a/Kconfig +++ b/Kconfig @@ -29,4 +29,6 @@ source "lib/Kconfig" source "lib/Kconfig.debug" +source "lib/Kconfig.openeuler" + source "Documentation/Kconfig" diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 9619098755fb..485c0f5b2518 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -415,6 +415,7 @@ enum { __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ + __WQ_DYNAMIC = 1 << 25, /* internal: ordered WQ with per-node pwqs, single work item only */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1888741f5edd..74431968a05c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4355,6 +4355,10 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, * it even if we don't use it immediately. 
*/ copy_workqueue_attrs(new_attrs, attrs); +#ifdef CONFIG_KWORKER_NUMA_AFFINITY + if (wq->flags & __WQ_DYNAMIC) + new_attrs->ordered = false; +#endif wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); @@ -4591,10 +4595,19 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); +#ifdef CONFIG_KWORKER_NUMA_AFFINITY + if (!(wq->flags & __WQ_DYNAMIC)) { + /* there should only be single pwq for ordering guarantee */ + WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || + wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + "ordering guarantee broken for workqueue %s\n", wq->name); + } +#else /* there should only be single pwq for ordering guarantee */ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || - wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), - "ordering guarantee broken for workqueue %s\n", wq->name); + wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + "ordering guarantee broken for workqueue %s\n", wq->name); +#endif } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } @@ -5799,7 +5812,11 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) /* creating multiple pwqs breaks ordering guarantee */ if (!list_empty(&wq->pwqs)) { +#ifdef CONFIG_KWORKER_NUMA_AFFINITY + if ((wq->flags & __WQ_ORDERED_EXPLICIT) && !(wq->flags & __WQ_DYNAMIC)) +#else if (wq->flags & __WQ_ORDERED_EXPLICIT) +#endif continue; wq->flags &= ~__WQ_ORDERED; } diff --git a/lib/Kconfig.openeuler b/lib/Kconfig.openeuler new file mode 100644 index 000000000000..fc5fdf0f1123 --- /dev/null +++ b/lib/Kconfig.openeuler @@ -0,0 +1,8 @@ +config KWORKER_NUMA_AFFINITY + bool "kworker NUMA affinity" + default n + help + This feature implements a set of adaptive mechanisms so that the + workqueue can automatically identify the CPU of the soft interrupt + and 
automatically schedule the workqueue to the corresponding NUMA + node. -- Gitee From a4ff4312a58c51e94d01a4e310eb2b91562ca6ea Mon Sep 17 00:00:00 2001 From: Shao Denghui Date: Wed, 6 Mar 2024 16:30:57 +0800 Subject: [PATCH 2/3] iscsi: use dynamic single thread workqueue to improve performance euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I94XYA CVE: NA ------------------------------------------------- On aarch64 NUMA machines, the kworker created by iscsi always jumps around across node boundaries. If it runs on a different node, or even a different CPU package, from the softirq of the network interface, the memcpy in iscsi_tcp_segment_recv slows down and iscsi performance becomes terrible. In this patch, we track the CPU of the softirq and tell queue_work_on to execute iscsi_xmitworker on the same NUMA node. Signed-off-by: Biaoxiang Ye Signed-off-by: shaodenghui --- drivers/scsi/iscsi_tcp.c | 13 +++++++++++++ drivers/scsi/libiscsi.c | 24 ++++++++++++++++++++++-- include/scsi/libiscsi.h | 1 + 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 8e14cea15f98..f7ae9de005ec 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -170,6 +170,9 @@ static void iscsi_sw_tcp_data_ready(struct sock *sk) struct iscsi_sw_tcp_conn *tcp_sw_conn; struct iscsi_tcp_conn *tcp_conn; struct iscsi_conn *conn; +#ifdef CONFIG_KWORKER_NUMA_AFFINITY + int current_cpu; +#endif trace_sk_data_ready(sk); @@ -180,6 +183,16 @@ static void iscsi_sw_tcp_data_ready(struct sock *sk) return; } tcp_conn = conn->dd_data; + +#ifdef CONFIG_KWORKER_NUMA_AFFINITY + /* save intimate cpu when in softirq */ + if (!sock_owned_by_user_nocheck(sk)) { + current_cpu = smp_processor_id(); + if (conn->intimate_cpu != current_cpu) + conn->intimate_cpu = current_cpu; + } +#endif + tcp_sw_conn = tcp_conn->dd_data; if (tcp_sw_conn->queue_recv) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 
0fda8905eabd..edb732d60c90 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -89,9 +89,20 @@ inline void iscsi_conn_queue_xmit(struct iscsi_conn *conn) { struct Scsi_Host *shost = conn->session->host; struct iscsi_host *ihost = shost_priv(shost); +#ifdef CONFIG_KWORKER_NUMA_AFFINITY + int intimate_cpu = conn->intimate_cpu; + if (ihost->workq) { + /* we expect it to be executed on the same NUMA node as the intimate cpu */ + if ((intimate_cpu >= 0) && cpu_possible(intimate_cpu)) + queue_work_on(intimate_cpu, ihost->workq, &conn->xmitwork); + else + queue_work(ihost->workq, &conn->xmitwork); + } +#else if (ihost->workq) queue_work(ihost->workq, &conn->xmitwork); +#endif } EXPORT_SYMBOL_GPL(iscsi_conn_queue_xmit); @@ -2907,9 +2918,15 @@ struct Scsi_Host *iscsi_host_alloc(const struct scsi_host_template *sht, ihost = shost_priv(shost); if (xmit_can_sleep) { +#ifdef CONFIG_KWORKER_NUMA_AFFINITY + /* this kind of workqueue only supports a single work item */ + ihost->workq = alloc_ordered_workqueue("iscsi_q_%d", __WQ_LEGACY | WQ_MEM_RECLAIM | + __WQ_DYNAMIC, shost->host_no); +#else ihost->workq = alloc_workqueue("iscsi_q_%d", - WQ_SYSFS | __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_UNBOUND, - 1, shost->host_no); + WQ_SYSFS | __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_UNBOUND, + 1, shost->host_no); +#endif if (!ihost->workq) goto free_host; } @@ -3190,6 +3207,9 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, conn->c_stage = ISCSI_CONN_INITIAL_STAGE; conn->id = conn_idx; conn->exp_statsn = 0; +#ifdef CONFIG_KWORKER_NUMA_AFFINITY + conn->intimate_cpu = -1; +#endif timer_setup(&conn->transport_timer, iscsi_check_transport_timeouts, 0); diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index d253b82e973e..ead17d2224a1 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -266,6 +266,7 @@ struct iscsi_conn { /* custom statistics */ uint32_t eh_abort_cnt; uint32_t fmr_unalign_cnt; + int intimate_cpu; /* CPU recorded by the data_ready callback, -1 if unset */ KABI_RESERVE(1) KABI_RESERVE(2) -- Gitee From 
066e7bd459688b272f83d8a78f999eabb4a08579 Mon Sep 17 00:00:00 2001 From: Shao Denghui Date: Wed, 6 Mar 2024 16:30:58 +0800 Subject: [PATCH 3/3] Add kernel compilation configuration options virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I94XYA ---------------------------------------------------------------------- Add KWORKER_NUMA_AFFINITY to control whether the scsi kworker code is compiled. Signed-off-by: Shao Denghui --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 2002fe8d9ea3..91577cbfb2f0 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1036,6 +1036,7 @@ CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y CONFIG_PID_MAX_PER_NAMESPACE=y CONFIG_FREEZER=y +CONFIG_KWORKER_NUMA_AFFINITY=y # # Executable file formats diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 627a08ecbf4a..987a13ca8d5f 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1033,6 +1033,7 @@ CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y CONFIG_PID_MAX_PER_NAMESPACE=y CONFIG_FREEZER=y +CONFIG_KWORKER_NUMA_AFFINITY=y # # Executable file formats -- Gitee