From 22eea3de0cb6cd30f427ce421508eb8d5e39d9ee Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Thu, 18 May 2023 21:02:14 +0800 Subject: [PATCH 01/22] net/smc: add sysctl interface for SMC mainline inclusion from mainline-v5.18-rc1 commit 462791bbfa350189e309a5a94541f6b63cd874e8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=462791bbfa350189e309a5a94541f6b63cd874e8 -------------------------------- This patch add sysctl interface to support container environment for SMC as we talk in the mail list. Link: https://lore.kernel.org/netdev/20220224020253.GF5443@linux.alibaba.com Co-developed-by: Tony Lu Signed-off-by: Tony Lu Signed-off-by: Dust Li Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- include/net/net_namespace.h | 5 ++- include/net/netns/smc.h | 10 ++++++ net/smc/Makefile | 1 + net/smc/af_smc.c | 8 +++++ net/smc/smc_sysctl.c | 70 +++++++++++++++++++++++++++++++++++++ net/smc/smc_sysctl.h | 32 +++++++++++++++++ 6 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 include/net/netns/smc.h create mode 100644 net/smc/smc_sysctl.c create mode 100644 net/smc/smc_sysctl.h diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c7faca9d7447..576372924f3d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -190,7 +191,9 @@ struct net { struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; - +#if IS_ENABLED(CONFIG_SMC) + struct netns_smc smc; +#endif KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h new file mode 100644 index 000000000000..0a7d25a124e9 --- /dev/null +++ b/include/net/netns/smc.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETNS_SMC_H__ +#define __NETNS_SMC_H__ + +struct netns_smc { 
+#ifdef CONFIG_SYSCTL + struct ctl_table_header *smc_hdr; +#endif +}; +#endif diff --git a/net/smc/Makefile b/net/smc/Makefile index cb1254541f37..efee8fa4a14e 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o +smc-y += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cfae95bfac14..b68594bff6e4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -48,6 +48,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_sysctl.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -2655,6 +2656,12 @@ static int __init smc_init(void) goto out_sock; } + rc = smc_sysctl_init(); + if (rc) { + pr_err("%s: sysctl_init fails with %d\n", __func__, rc); + goto out_sock; + } + static_branch_enable(&tcp_have_smc); return 0; @@ -2683,6 +2690,7 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_sysctl_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 000000000000..8a3a8e145976 --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu + * + */ + +#include +#include +#include + +#include "smc_sysctl.h" + +static struct ctl_table smc_table[] = { + { } +}; + +static __net_init int smc_sysctl_init_net(struct net *net) +{ + struct ctl_table *table; + + table = smc_table; + if (!net_eq(net, &init_net)) { + int i; + + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; + } + + net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc.smc_hdr) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static __net_exit void smc_sysctl_exit_net(struct net *net) +{ + unregister_net_sysctl_table(net->smc.smc_hdr); +} + +static struct pernet_operations smc_sysctl_ops __net_initdata = { + .init = smc_sysctl_init_net, + .exit = smc_sysctl_exit_net, +}; + +int __init smc_sysctl_init(void) +{ + return register_pernet_subsys(&smc_sysctl_ops); +} + +void smc_sysctl_exit(void) +{ + unregister_pernet_subsys(&smc_sysctl_ops); +} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 000000000000..49553ac236b6 --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL + +int smc_sysctl_init(void); +void smc_sysctl_exit(void); + +#else + +int smc_sysctl_init(void) +{ + return 0; +} + +void smc_sysctl_exit(void) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */ -- Gitee From 152866e8fd471dcc966c2827c3ddfa0243d40361 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Thu, 18 May 2023 21:50:17 +0800 Subject: [PATCH 02/22] net/smc: fix compile warning for smc_sysctl mainline inclusion from mainline-v5.18-rc1 commit 7de8eb0d9039f16e1122d7aa524a1502a160c4ff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=7de8eb0d9039f16e1122d7aa524a1502a160c4ff -------------------------------- kernel test robot reports multiple warning for smc_sysctl: In file included from net/smc/smc_sysctl.c:17: >> net/smc/smc_sysctl.h:23:5: warning: no previous prototype \ for function 'smc_sysctl_init' [-Wmissing-prototypes] int smc_sysctl_init(void) ^ and >> WARNING: modpost: vmlinux.o(.text+0x12ced2d): Section mismatch \ in reference from the function smc_sysctl_exit() to the variable .init.data:smc_sysctl_ops The function smc_sysctl_exit() references the variable __initdata smc_sysctl_ops. This is often because smc_sysctl_exit lacks a __initdata annotation or the annotation of smc_sysctl_ops is wrong. and net/smc/smc_sysctl.c: In function 'smc_sysctl_init_net': net/smc/smc_sysctl.c:47:17: error: 'struct netns_smc' has no member named 'smc_hdr' 47 | net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); Since we don't need global sysctl initialization. To make things clean and simple, remove the global pernet_operations and smc_sysctl_{init|exit}. Call smc_sysctl_net_{init|exit} directly from smc_net_{init|exit}. 
Also initialized sysctl_autocorking_size if CONFIG_SYSCTL it not set, this make sure SMC autocorking is enabled by default if CONFIG_SYSCTL is not set. Fixes: 462791bbfa35 ("net/smc: add sysctl interface for SMC") Reported-by: kernel test robot Signed-off-by: Dust Li Tested-by: Randy Dunlap # build-tested Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/Makefile | 2 +- net/smc/af_smc.c | 13 ++++++------- net/smc/smc_sysctl.c | 19 ++----------------- net/smc/smc_sysctl.h | 8 ++++---- 4 files changed, 13 insertions(+), 29 deletions(-) diff --git a/net/smc/Makefile b/net/smc/Makefile index efee8fa4a14e..79f53cc7d8dc 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -3,4 +3,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o -smc-y += smc_sysctl.o +smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b68594bff6e4..3f347d2ab1bb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2569,11 +2569,17 @@ unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { + int rc; + + rc = smc_sysctl_net_init(net); + if (rc) + return rc; return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { + smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } @@ -2656,12 +2662,6 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_sysctl_init(); - if (rc) { - pr_err("%s: sysctl_init fails with %d\n", __func__, rc); - goto out_sock; - } - static_branch_enable(&tcp_have_smc); return 0; @@ -2690,7 +2690,6 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); - smc_sysctl_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 8a3a8e145976..d2cc2f5bf089 100644 --- 
a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -20,7 +20,7 @@ static struct ctl_table smc_table[] = { { } }; -static __net_init int smc_sysctl_init_net(struct net *net) +int __net_init smc_sysctl_net_init(struct net *net) { struct ctl_table *table; @@ -49,22 +49,7 @@ static __net_init int smc_sysctl_init_net(struct net *net) return -ENOMEM; } -static __net_exit void smc_sysctl_exit_net(struct net *net) +void __net_exit smc_sysctl_net_exit(struct net *net) { unregister_net_sysctl_table(net->smc.smc_hdr); } - -static struct pernet_operations smc_sysctl_ops __net_initdata = { - .init = smc_sysctl_init_net, - .exit = smc_sysctl_exit_net, -}; - -int __init smc_sysctl_init(void) -{ - return register_pernet_subsys(&smc_sysctl_ops); -} - -void smc_sysctl_exit(void) -{ - unregister_pernet_subsys(&smc_sysctl_ops); -} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index 49553ac236b6..06af7393f3c2 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -15,17 +15,17 @@ #ifdef CONFIG_SYSCTL -int smc_sysctl_init(void); -void smc_sysctl_exit(void); +int __net_init smc_sysctl_net_init(struct net *net); +void __net_exit smc_sysctl_net_exit(struct net *net); #else -int smc_sysctl_init(void) +int __net_init smc_sysctl_net_init(struct net *net) { return 0; } -void smc_sysctl_exit(void) { } +void __net_exit smc_sysctl_net_exit(struct net *net) { } #endif /* CONFIG_SYSCTL */ -- Gitee From f39adb4cc7d5330936e87852ee8950360e5238a5 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Thu, 18 May 2023 22:01:25 +0800 Subject: [PATCH 03/22] net/smc: fix -Wmissing-prototypes warning when CONFIG_SYSCTL not set mainline inclusion from mainline-v5.18-rc1 commit d9f50991592513cc7633684cbaff65022cfa6816 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=d9f50991592513cc7633684cbaff65022cfa6816 -------------------------------- when CONFIG_SYSCTL not set, 
smc_sysctl_net_init/exit need to be static inline to avoid missing-prototypes if compile with W=1. Since __net_exit has noinline annotation when CONFIG_NET_NS not set, it should not be used with static inline. So remove the __net_init/exit when CONFIG_SYSCTL not set. Fixes: 7de8eb0d9039 ("net/smc: fix compile warning for smc_sysctl") Signed-off-by: Dust Li Link: https://lore.kernel.org/r/20220309033051.41893-1-dust.li@linux.alibaba.com Signed-off-by: Jakub Kicinski Signed-off-by: Litao Jiao --- net/smc/smc_sysctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index 06af7393f3c2..f04699bc8bbc 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -20,12 +20,12 @@ void __net_exit smc_sysctl_net_exit(struct net *net); #else -int __net_init smc_sysctl_net_init(struct net *net) +static inline int smc_sysctl_net_init(struct net *net) { return 0; } -void __net_exit smc_sysctl_net_exit(struct net *net) { } +static inline void smc_sysctl_net_exit(struct net *net) { } #endif /* CONFIG_SYSCTL */ -- Gitee From 8e1559e6ff2c7ba7e81ff9954d7d64997f71f910 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Thu, 18 May 2023 22:18:12 +0800 Subject: [PATCH 04/22] net/smc: fix a memory leak in smc_sysctl_net_exit() mainline inclusion from mainline-v5.18-rc1 commit 5ae6acf1d00be462d7b08b4a8748798ef595ae5a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=5ae6acf1d00be462d7b08b4a8748798ef595ae5a -------------------------------- Recently added smc_sysctl_net_exit() forgot to free the memory allocated from smc_sysctl_net_init() for non initial network namespace. Fixes: 462791bbfa35 ("net/smc: add sysctl interface for SMC") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Tony Lu Cc: Dust Li Signed-off-by: David S. 
Miller Signed-off-by: Litao Jiao --- net/smc/smc_sysctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index d2cc2f5bf089..e06ecf0e7c84 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -51,5 +51,10 @@ int __net_init smc_sysctl_net_init(struct net *net) void __net_exit smc_sysctl_net_exit(struct net *net) { + struct ctl_table *table; + + table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); + if (!net_eq(net, &init_net)) + kfree(table); } -- Gitee From eb4f17f1f6ecf522d70e3fc8c334f0a8563a129e Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Fri, 19 May 2023 10:42:01 +0800 Subject: [PATCH 05/22] net/smc: Introduce a sysctl for setting SMC-R buffer type mainline inclusion from mainline-v6.0-rc1 commit 4bc5008e4387106215b50ae1a4ac2467455725ca category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=4bc5008e4387106215b50ae1a4ac2467455725ca -------------------------------- This patch introduces the sysctl smcr_buf_type for setting the type of SMC-R sndbufs and RMBs. Valid values includes: - SMCR_PHYS_CONT_BUFS, which means use physically contiguous buffers for better performance and is the default value. - SMCR_VIRT_CONT_BUFS, which means use virtually contiguous buffers in case of physically contiguous memory is scarce. - SMCR_MIXED_BUFS, which means first try to use physically contiguous buffers. If not available, then use virtually contiguous buffers. Signed-off-by: Wen Gu Signed-off-by: David S. 
Miller Signed-off-by: Litao Jiao --- Documentation/networking/smc-sysctl.rst | 21 +++++++++++++++++++++ include/net/netns/smc.h | 1 + net/smc/smc_core.h | 6 ++++++ net/smc/smc_sysctl.c | 14 ++++++++++++++ 4 files changed, 42 insertions(+) create mode 100644 Documentation/networking/smc-sysctl.rst diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst new file mode 100644 index 000000000000..3f0187ffc2a5 --- /dev/null +++ b/Documentation/networking/smc-sysctl.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +SMC Sysctl +========== + +/proc/sys/net/smc/* Variables +============================= + +smcr_buf_type - INTEGER + Controls which type of sndbufs and RMBs to use in later newly created + SMC-R link group. Only for SMC-R. + + Default: 0 (physically contiguous sndbufs and RMBs) + + Possible values: + + - 0 - Use physically contiguous buffers + - 1 - Use virtually contiguous buffers + - 2 - Mixed use of the two types. Try physically contiguous buffers first. + If not available, use virtually contiguous buffers then. 
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 0a7d25a124e9..38396599938c 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -6,5 +6,6 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif + unsigned int sysctl_smcr_buf_type; }; #endif diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 548271424ee5..316b535de85b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -199,6 +199,12 @@ enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ }; +enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ + SMCR_PHYS_CONT_BUFS = 0, + SMCR_VIRT_CONT_BUFS = 1, + SMCR_MIXED_BUFS = 2, +}; + enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index e06ecf0e7c84..81faae0d1b7b 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -15,8 +15,20 @@ #include #include "smc_sysctl.h" +#include "smc_core.h" + +static int two = 2; static struct ctl_table smc_table[] = { + { + .procname = "smcr_buf_type", + .data = &init_net.smc.sysctl_smcr_buf_type, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, { } }; @@ -40,6 +52,8 @@ int __net_init smc_sysctl_net_init(struct net *net) if (!net->smc.smc_hdr) goto err_reg; + net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; + return 0; err_reg: -- Gitee From a4143dbfcee48de163974b9aedb4bcead521ab97 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Fri, 19 May 2023 15:06:24 +0800 Subject: [PATCH 06/22] net/smc: Use sysctl-specified types of buffers in new link group mainline inclusion from mainline-v6.0-rc1 commit b984f370ed5182d180f92dbf14bdf847ff6ccc04 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=b984f370ed5182d180f92dbf14bdf847ff6ccc04 -------------------------------- This patch introduces a new SMC-R specific element buf_type in struct smc_link_group, for recording the value of sysctl smcr_buf_type when link group is created. New created link group will create and reuse buffers of the type specified by buf_type. Signed-off-by: Wen Gu Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/smc_core.c | 1 + net/smc/smc_core.h | 1 + 2 files changed, 2 insertions(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fdf66b4f9fb2..5b7c080fdae0 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -437,6 +437,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) goto free_wq; lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; + lgr->buf_type = sock_net(&smc->sk)->smc.sysctl_smcr_buf_type; atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 316b535de85b..f44a77d675f3 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -260,6 +260,7 @@ struct smc_link_group { /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; + enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */ -- Gitee From 9449318a0743e9b02554c7e4abc8bcbf88bb27ef Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Fri, 19 May 2023 16:35:28 +0800 Subject: [PATCH 07/22] net/smc: Allow SMC-D 1MB DMB allocations mainline inclusion from mainline-v5.15-rc1 commit 67161779a9ea926fccee8de047ae66cbd3482b91 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=67161779a9ea926fccee8de047ae66cbd3482b91 -------------------------------- Commit a3fe3d01bd0d7 ("net/smc: introduce sg-logic for RMBs") introduced a restriction 
for RMB allocations as used by SMC-R. However, SMC-D does not use scatter-gather lists to back its DMBs, yet it was limited by this restriction, still. This patch exempts SMC, but limits allocations to the maximum RMB/DMB size respectively. Signed-off-by: Stefan Raspl Signed-off-by: Guvenc Gulce Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/smc_core.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5b7c080fdae0..d3fbd13ffe90 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1381,21 +1381,30 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) return rc; } -/* convert the RMB size into the compressed notation - minimum 16K. +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ + +/* convert the RMB size into the compressed notation (minimum 16K, see + * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ -static u8 smc_compress_bufsize(int size) +static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { + const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; - size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; + size = (size - 1) >> 14; /* convert to 16K multiple */ + compressed = min_t(u8, ilog2(size) + 1, + is_smcd ? 
SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb) + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + return compressed; } @@ -1617,17 +1626,12 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, return rc; } -#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ - static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc; - if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) - return ERR_PTR(-EAGAIN); - /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) @@ -1675,9 +1679,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(sk_buf_size); + for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { - if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_short]; @@ -1686,8 +1689,6 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) buf_list = &lgr->sndbufs[bufsize_short]; } bufsize = smc_uncompress_bufsize(bufsize_short); - if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) - continue; /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); -- Gitee From f7ffd1b2110ffe2ff5432cf6260af831a4f12388 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 13 Nov 2023 16:15:09 +0800 Subject: [PATCH 08/22] net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R mainline inclusion from mainline-v6.0-rc1 commit b8d199451c99b3796b840c350eb74b830c5c869b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=b8d199451c99b3796b840c350eb74b830c5c869b -------------------------------- On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which can cause unexpected hangs and further stability risks. So this patch aims to allow SMC-R link groups to use virtually contiguous sndbufs and RMBs to avoid the potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by the sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in the data path, which is caused by the additional address translation of the sndbuf by the RNIC in Tx. But in general, translating addresses through the MTT is fast. 
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c qperf -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. 
But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu Signed-off-by: David S. Miller Signed-off-by: Litao Jiao Conflicts: net/smc/smc_core.c net/smc/af_smc.c --- net/smc/af_smc.c | 66 ++++++++++++-- net/smc/smc_clc.c | 8 +- net/smc/smc_clc.h | 2 +- net/smc/smc_core.c | 210 +++++++++++++++++++++++++++++++-------------- net/smc/smc_core.h | 10 ++- net/smc/smc_ib.c | 15 ++-- net/smc/smc_llc.c | 26 +++--- net/smc/smc_rx.c | 90 +++++++++++++++---- net/smc/smc_tx.c | 9 +- 9 files changed, 321 insertions(+), 115 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3f347d2ab1bb..b4d31796b657 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -378,6 +378,29 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } +/* register the new vzalloced sndbuf on all links */ +static int smcr_lgr_reg_sndbufs(struct smc_link *link, + struct smc_buf_desc *snd_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + if (!snd_desc->is_vm) + return -EINVAL; + + /* protect against parallel smcr_link_reg_buf() */ + down_write(&lgr->llc_conf_lock); + 
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); + if (rc) + break; + } + up_write(&lgr->llc_conf_lock); + return rc; +} + /* register the new rmb on all links */ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) @@ -389,13 +412,13 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, if (rc) return rc; /* protect against parallel smc_llc_cli_rkey_exchange() and - * parallel smcr_link_reg_rmb() + * parallel smcr_link_reg_buf() */ down_write(&lgr->llc_conf_lock); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; - rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); + rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; } @@ -441,8 +464,15 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; @@ -832,8 +862,15 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + /* reg sendbufs if they were vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } + } if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_ERR_REGRMB; + reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } @@ -1318,8 +1355,15 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - if (smcr_link_reg_rmb(link, 
smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced*/ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); @@ -1647,8 +1691,14 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) struct smc_connection *conn = &new_smc->conn; if (!local_first) { + /* reg sendbufs if they were vzalloced */ + if (conn->sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(conn->lnk, + conn->sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + return SMC_CLC_DECL_ERR_REGBUF; } return 0; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 5ee5b2ce29a6..3f644be48d06 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -693,7 +693,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, ETH_ALEN); hton24(clc->r0.qpn, link->roce_qp->qp_num); clc->r0.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); switch (clc->hdr.type) { @@ -705,8 +705,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, break; } clc->r0.rmbe_size = conn->rmbe_size_short; - clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[link->link_idx].sgl)); + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? 
+ cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index c579d1d5995a..8992949900e9 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -52,7 +52,7 @@ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ +#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d3fbd13ffe90..6a0acf65e788 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -605,35 +605,38 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, return NULL; } -static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, +static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { + struct rw_semaphore *lock; /* lock buffer list */ int rc; - if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ down_read(&lgr->llc_conf_lock); - smc_llc_do_delete_rkey(lgr, rmb_desc); - rmb_desc->is_conf_rkey = false; + smc_llc_do_delete_rkey(lgr, buf_desc); + buf_desc->is_conf_rkey = false; up_read(&lgr->llc_conf_lock); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } - if (rmb_desc->is_reg_err) { + if (buf_desc->is_reg_err) { /* buf registration failed, reuse not possible */ - down_write(&lgr->rmbs_lock); - list_del(&rmb_desc->list); - up_write(&lgr->rmbs_lock); 
+ lock = is_rmb ? &lgr->rmbs_lock : + &lgr->sndbufs_lock; + down_write(lock); + list_del(&buf_desc->list); + up_write(lock); - smc_buf_free(lgr, true, rmb_desc); + smc_buf_free(lgr, is_rmb, buf_desc); } else { /* memzero_explicit provides potential memory barrier semantics */ - memzero_explicit(rmb_desc->cpu_addr, rmb_desc->len); - WRITE_ONCE(rmb_desc->used, 0); + memzero_explicit(buf_desc->cpu_addr, buf_desc->len); + WRITE_ONCE(buf_desc->used, 0); } } @@ -641,15 +644,21 @@ static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) { - memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); - WRITE_ONCE(conn->sndbuf_desc->used, 0); + if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { + smcr_buf_unuse(conn->sndbuf_desc, false, lgr); + } else { + memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); + WRITE_ONCE(conn->sndbuf_desc->used, 0); + } } - if (conn->rmb_desc && lgr->is_smcd) { - memzero_explicit(conn->rmb_desc->cpu_addr, - conn->rmb_desc->len + sizeof(struct smcd_cdc_msg)); - WRITE_ONCE(conn->rmb_desc->used, 0); - } else if (conn->rmb_desc) { - smcr_buf_unuse(conn->rmb_desc, lgr); + if (conn->rmb_desc) { + if (!lgr->is_smcd) { + smcr_buf_unuse(conn->rmb_desc, true, lgr); + } else { + memzero_explicit(conn->rmb_desc->cpu_addr, + conn->rmb_desc->len + sizeof(struct smcd_cdc_msg)); + WRITE_ONCE(conn->rmb_desc->used, 0); + } } } @@ -682,20 +691,21 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - if (is_rmb) + if (is_rmb || buf_desc->is_vm) buf_desc->is_reg_mr[lnk->link_idx] = false; if (!buf_desc->is_map_ib[lnk->link_idx]) return; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) { - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - buf_desc->mr_rx[lnk->link_idx] = NULL; - } + + if ((is_rmb || buf_desc->is_vm) && + buf_desc->mr[lnk->link_idx]) { + 
smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); + buf_desc->mr[lnk->link_idx] = NULL; + } + if (is_rmb) smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { + else smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); - } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); buf_desc->is_map_ib[lnk->link_idx] = false; } @@ -763,8 +773,10 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - if (buf_desc->pages) + if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); + else if (buf_desc->is_vm && buf_desc->cpu_addr) + vfree(buf_desc->cpu_addr); kfree(buf_desc); } @@ -1446,26 +1458,49 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -/* map an rmb buf to a link */ +/* map an buf to a link */ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - int rc; + int rc, i, nents, offset, buf_size, size, access_flags; + struct scatterlist *sg; + void *buf; if (buf_desc->is_map_ib[lnk->link_idx]) return 0; - rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (buf_desc->is_vm) { + buf = buf_desc->cpu_addr; + buf_size = buf_desc->len; + offset = offset_in_page(buf_desc->cpu_addr); + nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; + } else { + nents = 1; + } + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); if (rc) return rc; - sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, - buf_desc->cpu_addr, buf_desc->len); + if (buf_desc->is_vm) { + /* virtually contiguous buffer */ + for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { + size = min_t(int, PAGE_SIZE - offset, buf_size); + sg_set_page(sg, vmalloc_to_page(buf), size, offset); + buf += size / sizeof(*buf); + buf_size -= size; + offset = 0; + } + } else { + /* physically contiguous buffer */ + 
sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + } /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { + if (rc != nents) { rc = -EAGAIN; goto free_table; } @@ -1473,11 +1508,13 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, buf_desc->is_dma_need_sync |= smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, + if (is_rmb || buf_desc->is_vm) { + /* create a new memory region for the RMB or vzalloced sndbuf */ + access_flags = is_rmb ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_LOCAL_WRITE; + + rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, buf_desc, lnk->link_idx); if (rc) goto buf_unmap; @@ -1494,20 +1531,23 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, return rc; } -/* register a new rmb on IB device, - * must be called under lgr->llc_conf_mutex lock +/* register a new buf on IB device, rmb or vzalloced sndbuf + * must be called under lgr->llc_conf_lock lock */ -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) { if (list_empty(&link->lgr->list)) return -ENOLINK; - if (!rmb_desc->is_reg_mr[link->link_idx]) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->is_reg_err = true; + if (!buf_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new buf */ + if (buf_desc->is_vm) + buf_desc->mr[link->link_idx]->iova = + (uintptr_t)buf_desc->cpu_addr; + if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { + buf_desc->is_reg_err = true; return -EFAULT; } - 
rmb_desc->is_reg_mr[link->link_idx] = true; + buf_desc->is_reg_mr[link->link_idx] = true; } return 0; } @@ -1559,18 +1599,39 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) struct smc_buf_desc *buf_desc, *bf; int i, rc = 0; + /* reg all RMBs for a new link */ down_write(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; - rc = smcr_link_reg_rmb(lnk, buf_desc); - if (rc) - goto out; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + up_write(&lgr->rmbs_lock); + return rc; + } } } -out: + up_write(&lgr->rmbs_lock); + + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + return rc; + + /* reg all vzalloced sndbufs for a new link */ + down_write(&lgr->sndbufs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { + if (!buf_desc->used || !buf_desc->is_vm) + continue; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + up_write(&lgr->sndbufs_lock); + return rc; + } + } + } + up_write(&lgr->sndbufs_lock); return rc; } @@ -1584,18 +1645,39 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, if (!buf_desc) return ERR_PTR(-ENOMEM); - buf_desc->order = get_order(bufsize); - buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_COMP | - __GFP_NORETRY | __GFP_ZERO, - buf_desc->order); - if (!buf_desc->pages) { - kfree(buf_desc); - return ERR_PTR(-EAGAIN); - } - buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); - buf_desc->len = bufsize; + switch (lgr->buf_type) { + case SMCR_PHYS_CONT_BUFS: + case SMCR_MIXED_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (buf_desc->pages) { + buf_desc->cpu_addr = + (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + buf_desc->is_vm = false; + break; + } + if 
(lgr->buf_type == SMCR_PHYS_CONT_BUFS) + goto out; + fallthrough; // try virtually continguous buf + case SMCR_VIRT_CONT_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); + if (!buf_desc->cpu_addr) + goto out; + buf_desc->pages = NULL; + buf_desc->len = bufsize; + buf_desc->is_vm = true; + break; + } return buf_desc; + +out: + kfree(buf_desc); + return ERR_PTR(-EAGAIN); } /* map buf_desc on all usable links, @@ -1718,7 +1800,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { - smcr_buf_unuse(buf_desc, lgr); + smcr_buf_unuse(buf_desc, is_rmb, lgr); return -ENOMEM; } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f44a77d675f3..dd09c4b01e91 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -150,9 +150,11 @@ struct smc_buf_desc { struct { /* SMC-R */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region + struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; + /* memory region: for rmb and + * vzalloced sndbuf * incl. 
rkey provided to peer + * and lkey provided to local */ u32 order; /* allocation order */ @@ -165,6 +167,8 @@ struct smc_buf_desc { u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ + u8 is_vm; + /* virtually contiguous */ }; struct { /* SMC-D */ unsigned short sba_idx; @@ -418,7 +422,7 @@ int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index c1d6084a416a..1413665f3115 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -398,7 +398,7 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + sg_num = ib_map_mr_sg(buf_slot->mr[link_idx], buf_slot->sgt[link_idx].sgl, buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); @@ -410,20 +410,21 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[link_idx]) + if (buf_slot->mr[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[link_idx] = + buf_slot->mr[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[link_idx])) { + if (IS_ERR(buf_slot->mr[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[link_idx]); - buf_slot->mr_rx[link_idx] = NULL; + rc = PTR_ERR(buf_slot->mr[link_idx]); + buf_slot->mr[link_idx] = NULL; return rc; } - if 
(smc_ib_map_mr_sg(buf_slot, link_idx) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != + buf_slot->sgt[link_idx].orig_nents) return -EINVAL; return 0; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 18356f180c15..6b90bbafdba8 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -435,19 +435,22 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, if (smc_link_active(link) && link != send_link) { rkeyllc->rtoken[rtok_ix].link_id = link->link_id; rkeyllc->rtoken[rtok_ix].rmb_key = - htonl(rmb_desc->mr_rx[link->link_idx]->rkey); - rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address( - rmb_desc->sgt[link->link_idx].sgl)); + htonl(rmb_desc->mr[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[link->link_idx].sgl)); rtok_ix++; } } /* rkey of send_link is in rtoken[0] */ rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); - rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); + htonl(rmb_desc->mr[send_link->link_idx]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = rmb_desc->is_vm ? 
+ cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); put_out: @@ -474,7 +477,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: @@ -724,9 +727,10 @@ static int smc_llc_add_link_cont(struct smc_link *link, } rmb = *buf_pos; - addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); - addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); - addc_llc->rt[i].rmb_vaddr_new = + addc_llc->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = rmb->is_vm ? + cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); (*num_rkeys_todo)--; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 6687354a9369..44583bf2df8e 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -143,35 +143,93 @@ static void smc_rx_spd_release(struct splice_pipe_desc *spd, static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, struct smc_sock *smc) { + struct smc_link_group *lgr = smc->conn.lgr; + int offset = offset_in_page(src); + struct partial_page *partial; struct splice_pipe_desc spd; - struct partial_page partial; - struct smc_spd_priv *priv; - int bytes; + struct smc_spd_priv **priv; + struct page **pages; + int bytes, nr_pages; + int i; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? 
+ PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL); + if (!partial) + goto out_page; + priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL); if (!priv) - return -ENOMEM; - priv->len = len; - priv->smc = smc; - partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; - partial.len = len; - partial.private = (unsigned long)priv; - - spd.nr_pages_max = 1; - spd.nr_pages = 1; - spd.pages = &smc->conn.rmb_desc->pages; - spd.partial = &partial; + goto out_part; + for (i = 0; i < nr_pages; i++) { + priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL); + if (!priv[i]) + goto out_priv; + } + + if (lgr->is_smcd || + (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) { + /* smcd or smcr that uses physically contiguous RMBs */ + priv[0]->len = len; + priv[0]->smc = smc; + partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial[0].len = len; + partial[0].private = (unsigned long)priv[0]; + pages[0] = smc->conn.rmb_desc->pages; + } else { + int size, left = len; + void *buf = src; + /* smcr that uses virtually contiguous RMBs*/ + for (i = 0; i < nr_pages; i++) { + size = min_t(int, PAGE_SIZE - offset, left); + priv[i]->len = size; + priv[i]->smc = smc; + pages[i] = vmalloc_to_page(buf); + partial[i].offset = offset; + partial[i].len = size; + partial[i].private = (unsigned long)priv[i]; + buf += size / sizeof(*buf); + left -= size; + offset = 0; + } + } + spd.nr_pages_max = nr_pages; + spd.nr_pages = nr_pages; + spd.pages = pages; + spd.partial = partial; spd.ops = &smc_pipe_ops; spd.spd_release = smc_rx_spd_release; bytes = splice_to_pipe(pipe, &spd); if (bytes > 0) { sock_hold(&smc->sk); - get_page(smc->conn.rmb_desc->pages); + if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) { + for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++) + get_page(pages[i]); + } else { + get_page(smc->conn.rmb_desc->pages); + } atomic_add(bytes, 
&smc->conn.splice_pending); } + kfree(priv); + kfree(partial); + kfree(pages); return bytes; + +out_priv: + for (i = (i - 1); i >= 0; i--) + kfree(priv[i]); + kfree(priv); +out_part: + kfree(partial); +out_page: + kfree(pages); +out: + return -ENOMEM; } static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index ca29268acf4d..3a4ce37c9f82 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -367,6 +367,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); + u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -379,7 +380,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, u64 base_addr = dma_addr; if (dst_len < link->qp_attr.cap.max_inline_data) { - base_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; + base_addr = virt_addr; wr->wr.send_flags |= IB_SEND_INLINE; } else { wr->wr.send_flags &= ~IB_SEND_INLINE; @@ -387,8 +388,12 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sge[srcchunk].addr = base_addr + src_off; + sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? 
+ (virt_addr + src_off) : (base_addr + src_off); sge[srcchunk].length = src_len; + if (conn->sndbuf_desc->is_vm) + sge[srcchunk].lkey = + conn->sndbuf_desc->mr[link->link_idx]->lkey; num_sges++; src_off += src_len; -- Gitee From 0a6720edc3682f564ba3583a3f4612f1ca91ad62 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Sun, 21 May 2023 00:15:54 +0800 Subject: [PATCH 09/22] net/smc: Unbind r/w buffer size from clcsock and make them tunable mainline inclusion from mainline-v6.1-rc1 commit 0227f058aa29f5ab6f6ec79c3a36ae41f1e03a13 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=0227f058aa29f5ab6f6ec79c3a36ae41f1e03a13 -------------------------------- Currently, SMC uses smc->sk.sk_{rcv|snd}buf to create buffers for send buffer and RMB. And the values of buffer size are from tcp_{w|r}mem in clcsock. The buffer size from TCP socket doesn't fit SMC well. Generally, buffers are usually larger than TCP for SMC-R/-D to get higher performance, for they are different underlay devices and paths. So this patch unbinds buffer size from TCP, and introduces two sysctl knobs to tune them independently. Also, these knobs are per net namespace and work for containers. Signed-off-by: Tony Lu Signed-off-by: Paolo Abeni Signed-off-by: Litao Jiao --- Documentation/networking/smc-sysctl.rst | 18 ++++++++++++++++++ include/net/netns/smc.h | 2 ++ net/smc/af_smc.c | 5 +++-- net/smc/smc_core.c | 8 ++++---- net/smc/smc_sysctl.c | 21 ++++++++++++++++++++- 5 files changed, 47 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index 3f0187ffc2a5..e38c92ab46f3 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -19,3 +19,21 @@ smcr_buf_type - INTEGER - 1 - Use virtually contiguous buffers - 2 - Mixed use of the two types. 
Try physically contiguous buffers first. If not available, use virtually contiguous buffers then. + +wmem - INTEGER + Initial size of send buffer used by SMC sockets. + The default value inherits from net.ipv4.tcp_wmem[1]. + + The minimum value is 16KiB and there is no hard limit for max value, but + only allowed 512KiB for SMC-R and 1MiB for SMC-D. + + Default: 16K + +rmem - INTEGER + Initial size of receive buffer (RMB) used by SMC sockets. + The default value inherits from net.ipv4.tcp_rmem[1]. + + The minimum value is 16KiB and there is no hard limit for max value, but + only allowed 512KiB for SMC-R and 1MiB for SMC-D. + + Default: 128K diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 38396599938c..cded3f9a5081 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -7,5 +7,7 @@ struct netns_smc { struct ctl_table_header *smc_hdr; #endif unsigned int sysctl_smcr_buf_type; + int sysctl_wmem; + int sysctl_rmem; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b4d31796b657..e4b6e8717138 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -267,6 +267,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { smc->tcp_listen_works[i].smc = smc; @@ -2602,8 +2604,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, sk_common_release(sk); goto out; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); + out: return rc; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 6a0acf65e788..9968cfcdd9bb 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1756,10 +1756,10 @@ static int 
__smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf / 2; + sk_buf_size = smc->sk.sk_rcvbuf; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf / 2; + sk_buf_size = smc->sk.sk_sndbuf; for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { @@ -1808,7 +1808,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize * 2; + smc->sk.sk_rcvbuf = bufsize; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -1816,7 +1816,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize * 2; + smc->sk.sk_sndbuf = bufsize; atomic_set(&conn->sndbuf_space, bufsize); } return 0; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 81faae0d1b7b..a7cf6411d583 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -18,6 +18,8 @@ #include "smc_core.h" static int two = 2; +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; static struct ctl_table smc_table[] = { { @@ -29,6 +31,22 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "wmem", + .data = &init_net.smc.sysctl_wmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem", + .data = &init_net.smc.sysctl_rmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, { } }; @@ -53,7 +71,8 @@ int __net_init smc_sysctl_net_init(struct net *net) 
goto err_reg; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; - + WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); return 0; err_reg: -- Gitee From 4838ae72aec7b153b801a6f4551ec00f3044db6e Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 13 Nov 2023 16:20:32 +0800 Subject: [PATCH 10/22] net/smc: Add size match for smc_buf_get_slot sangfor inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC -------------------------------- This adds the ability to get an unused smc_buf_desc based on the buffer size, which ensures that the size of the obtained smc_buf_desc is the same as the requested size. Signed-off-by: Litao Jiao --- net/smc/smc_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9968cfcdd9bb..8708ad7f05f8 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1432,7 +1432,7 @@ int smc_uncompress_bufsize(u8 compressed) /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ -static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, +static struct smc_buf_desc *smc_buf_get_slot(int bufsize, struct rw_semaphore *lock, struct list_head *buf_list) { @@ -1440,7 +1440,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, down_read(lock); list_for_each_entry(buf_slot, buf_list, list) { - if (cmpxchg(&buf_slot->used, 0, 1) == 0) { + if (buf_slot->len == bufsize && (cmpxchg(&buf_slot->used, 0, 1) == 0)) { up_read(lock); return buf_slot; } @@ -1773,7 +1773,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) bufsize = smc_uncompress_bufsize(bufsize_short); /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(bufsize, lock, buf_list); if (buf_desc) { break; /* found
reusable slot */ } -- Gitee From 0b7bbc12d17ff97fd34f8457675f4967df1447ba Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 22 May 2023 20:32:16 +0800 Subject: [PATCH 11/22] net/smc: Tune the maximum size of virtually contiguous sndbufs or RMBs for SMC-R euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA -------------------------------- If the receiver application reads data more slowly than the sender writes it, the sender may encounter send failures due to a full sndbuf, and the receiver may not process the RMB in time, which leaves the sender unable to send data immediately. Increasing the buffer size appropriately can help reduce the probability of these problems and increase throughput. Therefore, raise the maximum size of virtually contiguous sndbufs or RMBs for SMC-R to 256M. Signed-off-by: Litao Jiao --- Documentation/networking/smc-sysctl.rst | 6 +++-- net/smc/smc_core.c | 32 +++++++++++++++++-------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index e38c92ab46f3..5983b951077e 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -25,7 +25,8 @@ wmem - INTEGER The default value inherits from net.ipv4.tcp_wmem[1]. The minimum value is 16KiB and there is no hard limit for max value, but - only allowed 512KiB for SMC-R and 1MiB for SMC-D. + only allowed 512KiB for SMC-R using physically contiguous buffers, 256MiB + for SMC-R using other buf type and 1MiB for SMC-D. Default: 16K @@ -34,6 +35,7 @@ rmem - INTEGER The default value inherits from net.ipv4.tcp_rmem[1]. The minimum value is 16KiB and there is no hard limit for max value, but - only allowed 512KiB for SMC-R and 1MiB for SMC-D. + only allowed 512KiB for SMC-R using physically contiguous buffers, 256MiB + for SMC-R using other buf type and 1MiB for SMC-D.
Default: 128K diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8708ad7f05f8..65e936f31972 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1393,29 +1393,41 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) return rc; } -#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ -#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 14 /* 0 -> 16KB, 1 -> 32KB, .. 14 -> 256MB */ /* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ -static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) +static u8 smc_compress_bufsize(struct smc_link_group *lgr, int size, bool is_smcd, bool is_rmb) { const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; - u8 compressed; + u8 compressed, max_phy_compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; size = (size - 1) >> 14; /* convert to 16K multiple */ compressed = min_t(u8, ilog2(size) + 1, - is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); - - if (!is_smcd && is_rmb) - /* RMBs are backed by & limited to max size of scatterlists */ - compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + is_smcd ? 
SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb && lgr->buf_type != SMCR_VIRT_CONT_BUFS) { + max_phy_compressed = ilog2(max_scat >> 14); + switch (lgr->buf_type) { + case SMCR_MIXED_BUFS: + if (compressed > max_phy_compressed) + break; + fallthrough; // try phys continguous buf + case SMCR_PHYS_CONT_BUFS: + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, max_phy_compressed); + break; + default: + break; + } + } return compressed; } @@ -1761,7 +1773,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf; - for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); + for (bufsize_short = smc_compress_bufsize(lgr, sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { if (is_rmb) { lock = &lgr->rmbs_lock; -- Gitee From 358a1f208b492139aed8df195eb9636122e598cc Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Tue, 30 May 2023 19:21:49 +0800 Subject: [PATCH 12/22] net/smc: Use reserve space when adding struct netns_smc in struct net sangfor inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC -------------------------------- When adding struct netns_smc to struct net, the size of struct net could change, so use KABI_USE to fix the problem. 
Signed-off-by: Litao Jiao --- include/net/net_namespace.h | 5 ++-- net/smc/af_smc.c | 4 +-- net/smc/smc_core.c | 3 ++- net/smc/smc_core.h | 1 + net/smc/smc_sysctl.c | 51 +++++++++++++++++++++++++------------ net/smc/smc_sysctl.h | 18 ++++++++++++- 6 files changed, 60 insertions(+), 22 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 576372924f3d..d415ecbd8958 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -192,9 +192,10 @@ struct net { #endif struct sock *diag_nlsk; #if IS_ENABLED(CONFIG_SMC) - struct netns_smc smc; -#endif + KABI_USE(1, struct netns_smc *smc) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e4b6e8717138..5264f0e140d2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -267,8 +267,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); - WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); + WRITE_ONCE(sk->sk_sndbuf, sysctl_smcr_wmem(net)); + WRITE_ONCE(sk->sk_rcvbuf, sysctl_smcr_rmem(net)); smc = smc_sk(sk); for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { smc->tcp_listen_works[i].smc = smc; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 65e936f31972..2106a3498a60 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -30,6 +30,7 @@ #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" +#include "smc_sysctl.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -437,7 +438,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) goto free_wq; lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; - lgr->buf_type = sock_net(&smc->sk)->smc.sysctl_smcr_buf_type; + lgr->buf_type = sysctl_smcr_buf_type(sock_net(&smc->sk)); 
atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index dd09c4b01e91..2089e4d274a2 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -187,6 +187,7 @@ struct smc_rtoken { /* address/key of remote RMB */ }; #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ +#define SMC_RMB_DEF_SIZE 131072 /* default size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, * i.e. compressed 5 and thus 6 sizes (0..5), despite diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a7cf6411d583..ac9aaa93a7b4 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -24,7 +24,6 @@ static int min_rcvbuf = SMC_BUF_MIN_SIZE; static struct ctl_table smc_table[] = { { .procname = "smcr_buf_type", - .data = &init_net.smc.sysctl_smcr_buf_type, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, @@ -33,7 +32,6 @@ static struct ctl_table smc_table[] = { }, { .procname = "wmem", - .data = &init_net.smc.sysctl_wmem, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -41,7 +39,6 @@ static struct ctl_table smc_table[] = { }, { .procname = "rmem", - .data = &init_net.smc.sysctl_rmem, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -50,34 +47,55 @@ static struct ctl_table smc_table[] = { { } }; +int sysctl_smcr_buf_type(struct net *net) +{ + return READ_ONCE(net->smc->sysctl_smcr_buf_type); +} + +int sysctl_smcr_wmem(struct net *net) +{ + return READ_ONCE(net->smc->sysctl_wmem); +} + +int sysctl_smcr_rmem(struct net *net) +{ + return READ_ONCE(net->smc->sysctl_rmem); +} + int __net_init smc_sysctl_net_init(struct net *net) { struct ctl_table *table; + int idx; table = smc_table; + net->smc = kmalloc(sizeof(*net->smc), GFP_KERNEL); + if (!net->smc) + goto err_alloc; if (!net_eq(net, &init_net)) { - int i; - table = kmemdup(table, sizeof(smc_table), 
GFP_KERNEL); if (!table) - goto err_alloc; - - for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) - table[i].data += (void *)net - (void *)&init_net; + goto err_table; } - net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); - if (!net->smc.smc_hdr) + idx = 0; + net->smc->sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; + table[idx++].data = &net->smc->sysctl_smcr_buf_type; + WRITE_ONCE(net->smc->sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); + table[idx++].data = &net->smc->sysctl_wmem; + WRITE_ONCE(net->smc->sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); + table[idx++].data = &net->smc->sysctl_rmem; + + net->smc->smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc->smc_hdr) goto err_reg; - net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; - WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); - WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); +err_table: + kfree(net->smc); err_alloc: return -ENOMEM; } @@ -86,8 +104,9 @@ void __net_exit smc_sysctl_net_exit(struct net *net) { struct ctl_table *table; - table = net->smc.smc_hdr->ctl_table_arg; - unregister_net_sysctl_table(net->smc.smc_hdr); + table = net->smc->smc_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->smc->smc_hdr); if (!net_eq(net, &init_net)) kfree(table); + kfree(net->smc); } diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index f04699bc8bbc..bc0893af5c47 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -14,11 +14,27 @@ #define _SMC_SYSCTL_H #ifdef CONFIG_SYSCTL - +int sysctl_smcr_buf_type(struct net *net); +int sysctl_smcr_wmem(struct net *net); +int sysctl_smcr_rmem(struct net *net); int __net_init smc_sysctl_net_init(struct net *net); void __net_exit smc_sysctl_net_exit(struct net *net); #else +static inline int sysctl_smcr_buf_type(struct net *net) +{ + return SMCR_PHYS_CONT_BUFS; +} + +static inline int 
sysctl_smcr_wmem(struct net *net) +{ + return SMC_BUF_MIN_SIZE; +} + +static inline int sysctl_smcr_rmem(struct net *net) +{ + return SMC_RMB_DEF_SIZE; +} static inline int smc_sysctl_net_init(struct net *net) { -- Gitee From 730ec86fe540f13bb0286c978916392bf48e7473 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Tue, 7 Nov 2023 13:43:03 +0800 Subject: [PATCH 13/22] anolis: net/smc: Expose SMCPROTO_SMC and SMCPROTO_SMC6 to userspace anolis inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/d4f40dd9761c7a3dcd8e7f83abbf70eb387fbf01 -------------------------------- ANBZ: #5550 This patch exposes SMCPROTO_SMC and SMCPROTO_SMC6 to userspace by moving them to in.h and in6.h. Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1754 Signed-off-by: Litao Jiao --- include/uapi/linux/in.h | 3 +++ include/uapi/linux/in6.h | 2 ++ net/smc/smc.h | 4 ---- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 3960bc3da6b3..8b9bdc70d270 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,6 +84,9 @@ enum { }; #endif +/* SMC protocol, IPv4 */ +#define SMCPROTO_SMC 0 + #if __UAPI_DEF_IN_ADDR /* Internet address. 
*/ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 5ad396a57eb3..6c21c85be0e3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,6 +95,8 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 +/* SMC protocol, IPv6 */ +#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/net/smc/smc.h b/net/smc/smc.h index 6fe4cf14dc27..8ea5b8fb9e99 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,10 +21,6 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 - -#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ -#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ - #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ -- Gitee From 837f1d71f8aec89b285e354ddc246b8e7c45101c Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Tue, 7 Nov 2023 15:11:41 +0800 Subject: [PATCH 14/22] anolis: net/smc: Introduce sysctl tcp2smc anolis inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/834a999fa7461d43ce2bf183ab9de7a5245baf2e -------------------------------- ANBZ: #5550 This patch adds sysctl 'tcp2smc' to provide a switch for replacing TCP to SMC-R when new sockets are created in a specific net namespace. 
Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Reviewed-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/1754 Signed-off-by: Litao Jiao --- include/net/netns/smc.h | 1 + net/smc/smc_sysctl.c | 8 ++++++++ net/socket.c | 9 +++++++++ 3 files changed, 18 insertions(+) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index cded3f9a5081..db3d4e704d9c 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -9,5 +9,6 @@ struct netns_smc { unsigned int sysctl_smcr_buf_type; int sysctl_wmem; int sysctl_rmem; + int sysctl_tcp2smc; }; #endif diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index ac9aaa93a7b4..f8a97d558830 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -44,6 +44,12 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, + { + .procname = "tcp2smc", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -84,6 +90,8 @@ int __net_init smc_sysctl_net_init(struct net *net) table[idx++].data = &net->smc->sysctl_wmem; WRITE_ONCE(net->smc->sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); table[idx++].data = &net->smc->sysctl_rmem; + net->smc->sysctl_tcp2smc = 0; + table[idx++].data = &net->smc->sysctl_tcp2smc; net->smc->smc_hdr = register_net_sysctl(net, "net/smc", table); if (!net->smc->smc_hdr) diff --git a/net/socket.c b/net/socket.c index 42c1bbe84236..4b5d80486fac 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1369,6 +1369,15 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family = PF_PACKET; } +#if IS_ENABLED(CONFIG_SMC) + if (!kern && net->smc && net->smc->sysctl_tcp2smc && + (family == AF_INET || family == AF_INET6) && + type == SOCK_STREAM && (protocol == IPPROTO_IP || + protocol == IPPROTO_TCP)) { + protocol = (family == AF_INET) ? 
SMCPROTO_SMC : SMCPROTO_SMC6; + family = AF_SMC; + } +#endif err = security_socket_create(family, type, protocol, kern); if (err) -- Gitee From a307b2b1b76bb307e198f6db5ac9651e0f176b7a Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Tue, 14 Nov 2023 10:24:05 +0800 Subject: [PATCH 15/22] net/smc: Fix setsockopt and sysctl to specify same buffer size again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.5-rc6 commit 833bac7ec392bf75053c8a4fa4c36d4148dac77d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=833bac7ec392bf75053c8a4fa4c36d4148dac77d -------------------------------- Commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") introduced the net.smc.rmem and net.smc.wmem sysctls to specify the size of buffers to be used for SMC type connections. This created a regression for users that specified the buffer size via setsockopt() as the effective buffer size was now doubled. Re-introduce the division by 2 in the SMC buffer create code and level this out by duplicating the net.smc.[rw]mem values used for initializing sk_rcvbuf/sk_sndbuf at socket creation time. This gives users of both methods (setsockopt or sysctl) the effective buffer size that they expect. Initialize net.smc.[rw]mem from its own constant of 64kB, respectively. Internal performance tests show that this value is a good compromise between throughput/latency and memory consumption. Also, this decouples it from any tuning that was done to net.ipv4.tcp_[rw]mem[1] before the module for SMC protocol was loaded. Check that no more than INT_MAX / 2 is assigned to net.smc.[rw]mem, in order to avoid any overflow condition when that is doubled for use in sk_sndbuf or sk_rcvbuf. 
While at it, drop the confusing sk_buf_size variable from __smc_buf_create and name "compressed" buffer size variables more consistently. Background: Before the commit mentioned above, SMC's buffer allocator in __smc_buf_create() always used half of the sockets' sk_rcvbuf/sk_sndbuf value as initial value to search for appropriate buffers. If the search resorted to using a bigger buffer when all buffers of the specified size were busy, the duplicate of the used effective buffer size is stored back to sk_rcvbuf/sk_sndbuf. When available, buffers of exactly the size that a user had specified as input to setsockopt() were used, despite setsockopt()'s documentation in "man 7 socket" talking of a mandatory duplication: [...] SO_SNDBUF Sets or gets the maximum socket send buffer in bytes. The kernel doubles this value (to allow space for book‐ keeping overhead) when it is set using setsockopt(2), and this doubled value is returned by getsockopt(2). The default value is set by the /proc/sys/net/core/wmem_default file and the maximum allowed value is set by the /proc/sys/net/core/wmem_max file. The minimum (doubled) value for this option is 2048. [...] Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") Co-developed-by: Jan Karcher Signed-off-by: Jan Karcher Reviewed-by: Wenjia Zhang Reviewed-by: Tony Lu Signed-off-by: Gerd Bayer Signed-off-by: David S. 
Miller Signed-off-by: Litao Jiao Conflicts: net/smc/smc_core.c net/smc/smc_sysctl.c --- net/smc/af_smc.c | 4 ++-- net/smc/smc.h | 2 +- net/smc/smc_clc.c | 4 ++-- net/smc/smc_core.c | 21 ++++++++++----------- net/smc/smc_sysctl.c | 10 ++++++++-- 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5264f0e140d2..2d8bf7fec6e0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -267,8 +267,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - WRITE_ONCE(sk->sk_sndbuf, sysctl_smcr_wmem(net)); - WRITE_ONCE(sk->sk_rcvbuf, sysctl_smcr_rmem(net)); + WRITE_ONCE(sk->sk_sndbuf, 2 * sysctl_smcr_wmem(net)); + WRITE_ONCE(sk->sk_rcvbuf, 2 * sysctl_smcr_rmem(net)); smc = smc_sk(sk); for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { smc->tcp_listen_works[i].smc = smc; diff --git a/net/smc/smc.h b/net/smc/smc.h index 8ea5b8fb9e99..691d110fdb95 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -140,7 +140,7 @@ struct smc_connection { struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size_short;/* compressed notation */ + int rmbe_size_comp; /* compressed notation */ int rmbe_update_limit; /* lower limit for consumer * cursor update diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3f644be48d06..c9450ab0e23b 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -658,7 +658,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = conn->lgr->smcd->local_gid; clc->d0.token = conn->rmb_desc->token; - clc->d0.dmbe_size = conn->rmbe_size_short; + clc->d0.dmbe_size = conn->rmbe_size_comp; clc->d0.dmbe_idx = 0; memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); if (version == SMC_V1) { @@ -704,7 +704,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->r0.qp_mtu = 
min(link->path_mtu, link->peer_mtu); break; } - clc->r0.rmbe_size = conn->rmbe_size_short; + clc->r0.rmbe_size = conn->rmbe_size_comp; clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : cpu_to_be64((u64)sg_dma_address diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2106a3498a60..dcd884a3cb28 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1763,27 +1763,26 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; - int bufsize, bufsize_short; + int bufsize, bufsize_comp; struct rw_semaphore *lock; /* lock buffer list */ - int sk_buf_size; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf; + bufsize = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf; + bufsize = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(lgr, sk_buf_size, is_smcd, is_rmb); - bufsize_short >= 0; bufsize_short--) { + for (bufsize_comp = smc_compress_bufsize(lgr, bufsize, is_smcd, is_rmb); + bufsize_comp >= 0; bufsize_comp--) { if (is_rmb) { lock = &lgr->rmbs_lock; - buf_list = &lgr->rmbs[bufsize_short]; + buf_list = &lgr->rmbs[bufsize_comp]; } else { lock = &lgr->sndbufs_lock; - buf_list = &lgr->sndbufs[bufsize_short]; + buf_list = &lgr->sndbufs[bufsize_comp]; } - bufsize = smc_uncompress_bufsize(bufsize_short); + bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize, lock, buf_list); @@ -1820,8 +1819,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; - conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize; + conn->rmbe_size_comp = bufsize_comp; + smc->sk.sk_rcvbuf = bufsize * 2; 
atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index f8a97d558830..5360a9446202 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -20,6 +20,10 @@ static int two = 2; static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; +static int max_sndbuf = INT_MAX / 2; +static int max_rcvbuf = INT_MAX / 2; +static const int net_smc_wmem_init = (64 * 1024); +static const int net_smc_rmem_init = (64 * 1024); static struct ctl_table smc_table[] = { { @@ -36,6 +40,7 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, + .extra2 = &max_sndbuf, }, { .procname = "rmem", @@ -43,6 +48,7 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, + .extra2 = &max_rcvbuf, }, { .procname = "tcp2smc", @@ -86,9 +92,9 @@ int __net_init smc_sysctl_net_init(struct net *net) idx = 0; net->smc->sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; table[idx++].data = &net->smc->sysctl_smcr_buf_type; - WRITE_ONCE(net->smc->sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(net->smc->sysctl_wmem, net_smc_wmem_init); table[idx++].data = &net->smc->sysctl_wmem; - WRITE_ONCE(net->smc->sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); + WRITE_ONCE(net->smc->sysctl_rmem, net_smc_rmem_init); table[idx++].data = &net->smc->sysctl_rmem; net->smc->sysctl_tcp2smc = 0; table[idx++].data = &net->smc->sysctl_tcp2smc; -- Gitee From d14cdbd5bb85d55541399e4811e297d11467a870 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Tue, 14 Nov 2023 10:47:36 +0800 Subject: [PATCH 16/22] net/smc: Use correct buffer sizes when switching between TCP and SMC mainline inclusion from mainline-v6.5-rc6 commit 30c3c4a4497c3765bf6b298f5072c8165aeaf7cc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=30c3c4a4497c3765bf6b298f5072c8165aeaf7cc -------------------------------- Tuning of the effective buffer size through setsockopts was working for SMC traffic only but not for TCP fall-back connections even before commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable"). That change made it apparent that TCP fall-back connections would use net.smc.[rw]mem as buffer size instead of net.ipv4.tcp_[rw]mem. Amend the code that copies attributes between the (TCP) clcsock and the SMC socket and adjust buffer sizes appropriately: - Copy over sk_userlocks so that both sockets agree on whether tuning via setsockopt is active. - When falling back to TCP use sk_sndbuf or sk_rcvbuf as specified with setsockopt. Otherwise, use the sysctl value for TCP/IPv4. - Likewise, use either values from setsockopt or from sysctl for SMC (duplicated) on successful SMC connect. In smc_tcp_listen_work() drop the explicit copy of buffer sizes as that is taken care of by the attribute copy. Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") Reviewed-by: Wenjia Zhang Reviewed-by: Tony Lu Signed-off-by: Gerd Bayer Signed-off-by: David S. 
Miller Signed-off-by: jiaolitao48147 <48147@sangfor.com> Signed-off-by: jiaolitao48147 <48147@sangfor.com> Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 73 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2d8bf7fec6e0..cbc499e996f4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -328,13 +328,60 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, return rc; } +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from net/core) + */ + +#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_BROADCAST) | \ + (1UL << SOCK_TIMESTAMP) | \ + (1UL << SOCK_DBG) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_RCVTSTAMPNS) | \ + (1UL << SOCK_LOCALROUTE) | \ + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ + (1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_WIFI_STATUS) | \ + (1UL << SOCK_NOFCS) | \ + (1UL << SOCK_FILTER_LOCKED) | \ + (1UL << SOCK_TSTAMP_NEW)) + +/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ +static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + struct net *nnet = sock_net(nsk); + + nsk->sk_userlocks = osk->sk_userlocks; + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { + nsk->sk_sndbuf = osk->sk_sndbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_sndbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); + else + WRITE_ONCE(nsk->sk_sndbuf, + 2 * sysctl_smcr_wmem(nnet)); + } + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { + nsk->sk_rcvbuf = osk->sk_rcvbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_rcvbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); + else + WRITE_ONCE(nsk->sk_rcvbuf, + 2 * sysctl_smcr_rmem(nnet)); + } +} + static void smc_copy_sock_settings(struct sock *nsk, 
struct sock *osk, unsigned long mask) { /* options we don't get control via setsockopt for */ nsk->sk_type = osk->sk_type; - nsk->sk_sndbuf = osk->sk_sndbuf; - nsk->sk_rcvbuf = osk->sk_rcvbuf; nsk->sk_sndtimeo = osk->sk_sndtimeo; nsk->sk_rcvtimeo = osk->sk_rcvtimeo; nsk->sk_mark = osk->sk_mark; @@ -345,26 +392,10 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, nsk->sk_flags &= ~mask; nsk->sk_flags |= osk->sk_flags & mask; + + smc_adjust_sock_bufsizes(nsk, osk, mask); } -#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ - (1UL << SOCK_KEEPOPEN) | \ - (1UL << SOCK_LINGER) | \ - (1UL << SOCK_BROADCAST) | \ - (1UL << SOCK_TIMESTAMP) | \ - (1UL << SOCK_DBG) | \ - (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVTSTAMPNS) | \ - (1UL << SOCK_LOCALROUTE) | \ - (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ - (1UL << SOCK_RXQ_OVFL) | \ - (1UL << SOCK_WIFI_STATUS) | \ - (1UL << SOCK_NOFCS) | \ - (1UL << SOCK_FILTER_LOCKED) | \ - (1UL << SOCK_TSTAMP_NEW)) -/* copy only relevant settings and flags of SOL_SOCKET level from smc to - * clc socket (since smc is not called for these options from net/core) - */ static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) { smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); @@ -1929,8 +1960,6 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); - new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; - new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); -- Gitee From 82496d54f2db5ea5c42a546f291040ad9916201a Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Sat, 25 Nov 2023 17:32:43 +0800 Subject: [PATCH 17/22] net/smc: Forward wakeup to smc socket waitqueue after fallback mainline inclusion from mainline-v5.17-rc3 
commit 341adeec9adad0874f29a0a1af35638207352a39 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=341adeec9adad0874f29a0a1af35638207352a39 -------------------------------- When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remain in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. 
The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1 ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 132 ++++++++++++++++++++++++++++++++++++++++++----- net/smc/smc.h | 20 ++++++- 2 files changed, 137 insertions(+), 15 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cbc499e996f4..2d43ba4735ed 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -605,11 +605,109 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->r0.qp_mtu; } +/* must be called under rcu read lock */ +static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) +{ + struct socket_wq *wq; + __poll_t flags; + + wq = rcu_dereference(smc->sk.sk_wq); + if (!skwq_has_sleeper(wq)) + return; + + /* wake up smc sk->sk_wq */ + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); 
+ } +} + +static int smc_fback_mark_woken(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) +{ + struct smc_mark_woken *mark = + container_of(wait, struct smc_mark_woken, wait_entry); + + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, + void (*clcsock_callback)(struct sock *sk)) +{ + struct smc_mark_woken mark = { .woken = false }; + struct socket_wq *wq; + + init_waitqueue_func_entry(&mark.wait_entry, + smc_fback_mark_woken); + rcu_read_lock(); + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) + goto out; + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsock_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + + if (mark.woken) + smc_fback_wakeup_waitqueue(smc, mark.key); +out: + rcu_read_unlock(); +} + +static void smc_fback_state_change(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); +} + +static void smc_fback_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); +} + +static void smc_fback_write_space(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); +} + +static void smc_fback_error_report(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); +} + static void smc_switch_to_fallback(struct smc_sock *smc) { - wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); - unsigned long flags; + struct sock *clcsk; + + clcsk = smc->clcsock->sk; smc->use_fallback = true; if (smc->sk.sk_socket && 
smc->sk.sk_socket->file) { @@ -618,15 +716,22 @@ static void smc_switch_to_fallback(struct smc_sock *smc) smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - /* There may be some entries remaining in - * smc socket->wq, which should be removed - * to clcsocket->wq during the fallback. + /* There might be some wait entries remaining + * in smc sk->sk_wq and they should be woken up + * as clcsock's wait queue is woken up. */ - spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); - list_splice_init(&smc_wait->head, &clc_wait->head); - spin_unlock(&clc_wait->lock); - spin_unlock_irqrestore(&smc_wait->lock, flags); + smc->clcsk_state_change = clcsk->sk_state_change; + smc->clcsk_data_ready = clcsk->sk_data_ready; + smc->clcsk_write_space = clcsk->sk_write_space; + smc->clcsk_error_report = clcsk->sk_error_report; + + clcsk->sk_state_change = smc_fback_state_change; + clcsk->sk_data_ready = smc_fback_data_ready; + clcsk->sk_write_space = smc_fback_write_space; + clcsk->sk_error_report = smc_fback_error_report; + + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } } @@ -1972,10 +2077,9 @@ static void smc_tcp_listen_work(struct work_struct *work) static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc; + struct smc_sock *lsmc = + smc_clcsock_user_data(listen_clcsock); - lsmc = (struct smc_sock *) - ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); if (!lsmc) return; lsmc->clcsk_data_ready(listen_clcsock); diff --git a/net/smc/smc.h b/net/smc/smc.h index 691d110fdb95..3e40ee2d8223 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -126,6 +126,12 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; +struct smc_mark_woken { + bool woken; + void *key; + wait_queue_entry_t wait_entry; +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -226,8 
+232,14 @@ struct smc_tcp_listen_work { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); - /* original data_ready fct. **/ + /* original data_ready fct. */ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ @@ -264,6 +276,12 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) +{ + return (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ -- Gitee From 1dd3c566dd838ef2d1f8fcee9b613223914fdcf0 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 27 Nov 2023 16:25:12 +0800 Subject: [PATCH 18/22] net/smc: fix documentation of buffer sizes mainline inclusion from mainline-v6.7-rc1 commit a1602d749097386ec9e8e411a16a9c37ff6cd5fc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=a1602d749097386ec9e8e411a16a9c37ff6cd5fc -------------------------------- Since commit 833bac7ec392 ("net/smc: Fix setsockopt and sysctl to specify same buffer size again") the SMC protocol uses its own default values for the smc.rmem and smc.wmem sysctl variables which are no longer derived from the TCP IPv4 buffer sizes. Fixup the kernel documentation to reflect this change, too. 
Fixes: 833bac7ec392 ("net/smc: Fix setsockopt and sysctl to specify same buffer size again") Signed-off-by: Gerd Bayer Reviewed-by: Wenjia Zhang Reviewed-by: Dust Li Link: https://lore.kernel.org/r/20231030170343.748097-1-gbayer@linux.ibm.com Signed-off-by: Paolo Abeni Signed-off-by: Litao Jiao --- Documentation/networking/smc-sysctl.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index 5983b951077e..fc08b26ca7be 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -22,20 +22,18 @@ smcr_buf_type - INTEGER wmem - INTEGER Initial size of send buffer used by SMC sockets. - The default value inherits from net.ipv4.tcp_wmem[1]. The minimum value is 16KiB and there is no hard limit for max value, but only allowed 512KiB for SMC-R using physically contiguous buffers, 256MiB for SMC-R using other buf type and 1MiB for SMC-D. - Default: 16K + Default: 64KiB rmem - INTEGER Initial size of receive buffer (RMB) used by SMC sockets. - The default value inherits from net.ipv4.tcp_rmem[1]. The minimum value is 16KiB and there is no hard limit for max value, but only allowed 512KiB for SMC-R using physically contiguous buffers, 256MiB for SMC-R using other buf type and 1MiB for SMC-D. 
- Default: 128K + Default: 64KiB -- Gitee From cfaf3c09ba6da4d0c33da271b5995fd5c1ece5fc Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 27 Nov 2023 17:22:57 +0800 Subject: [PATCH 19/22] net/smc: Transitional solution for clcsock race issue mainline inclusion from mainline-v5.17-rc2 commit c0bf3d8a943b6f2e912b7c1de03e2ef28e76f760 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=c0bf3d8a943b6f2e912b7c1de03e2ef28e76f760 -------------------------------- We encountered a crash in smc_setsockopt() and it is caused by accessing smc->clcsock after clcsock was released. BUG: kernel NULL pointer dereference, address: 0000000000000020 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 50309 Comm: nginx Kdump: loaded Tainted: G E 5.16.0-rc4+ #53 RIP: 0010:smc_setsockopt+0x59/0x280 [smc] Call Trace: __sys_setsockopt+0xfc/0x190 __x64_sys_setsockopt+0x20/0x30 do_syscall_64+0x34/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f16ba83918e This patch tries to fix it by holding clcsock_release_lock and checking whether clcsock has already been released before access. In case a crash for the same reason happens in smc_getsockopt() or smc_switch_to_fallback(), this patch also checks smc->clcsock in them. And the caller of smc_switch_to_fallback() will identify whether fallback succeeds according to the return value. Fixes: fd57770dd198 ("net/smc: wait for pending work before clcsock release_sock") Link: https://lore.kernel.org/lkml/5dd7ffd1-28e2-24cc-9442-1defec27375e@linux.ibm.com/T/ Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: David S.
Miller Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 62 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2d43ba4735ed..00e367bf91a7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -703,10 +703,15 @@ static void smc_fback_error_report(struct sock *clcsk) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); } -static void smc_switch_to_fallback(struct smc_sock *smc) +static int smc_switch_to_fallback(struct smc_sock *smc) { struct sock *clcsk; + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } clcsk = smc->clcsock->sk; smc->use_fallback = true; @@ -733,12 +738,21 @@ static void smc_switch_to_fallback(struct smc_sock *smc) smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } + mutex_unlock(&smc->clcsock_release_lock); + return 0; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc); + int rc = 0; + + rc = smc_switch_to_fallback(smc); + if (rc) { /* fallback fails */ + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return rc; + } smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; @@ -1586,11 +1600,12 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, smc_lgr_cleanup_early(&new_smc->conn); else smc_conn_free(&new_smc->conn); - if (reason_code < 0) { /* error, no fallback possible */ + if (reason_code < 0 || + smc_switch_to_fallback(new_smc)) { + /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { @@ -1951,9 +1966,13 @@ static void 
smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; - smc_listen_out_connected(new_smc); + rc = smc_switch_to_fallback(new_smc); + if (rc) { + smc_listen_out_err(new_smc); + } else { + new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; + smc_listen_out_connected(new_smc); + } return; } @@ -2237,7 +2256,9 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { /* not connected yet, fallback */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); + rc = smc_switch_to_fallback(smc); + if (rc) + goto out; smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; @@ -2446,6 +2467,11 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else @@ -2455,6 +2481,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } + mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -2471,8 +2498,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + rc = smc_switch_to_fallback(smc); + if (!rc) + smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; } @@ -2513,13 +2541,23 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; 
+ int rc; smc = smc_sk(sock->sk); + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } /* socket options apply to the CLC socket */ - if (unlikely(!smc->clcsock->ops->getsockopt)) + if (unlikely(!smc->clcsock->ops->getsockopt)) { + mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; - return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + } + rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); + mutex_unlock(&smc->clcsock_release_lock); + return rc; } static int smc_ioctl(struct socket *sock, unsigned int cmd, -- Gitee From 49909147f19e9b14485fbd7842251855f40ca10b Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 27 Nov 2023 17:52:56 +0800 Subject: [PATCH 20/22] net/smc: Avoid overwriting the copies of clcsock callback functions mainline inclusion from mainline-v5.17-rc5 commit 1de9770d121ee9294794cca0e0be8fbfa0134ee8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=1de9770d121ee9294794cca0e0be8fbfa0134ee8 -------------------------------- The callback functions of clcsock will be saved and replaced during the fallback. But if the fallback happens more than once, then the copies of these callback functions will be overwritten incorrectly, resulting in a loop call issue: clcsk->sk_error_report |- smc_fback_error_report() <------------------------------| |- smc_fback_forward_wakeup() | (loop) |- clcsock_callback() (incorrectly overwritten) | |- smc->clcsk_error_report() ------------------| So this patch fixes the issue by saving these function pointers only once in the fallback and avoiding overwriting. 
Reported-by: syzbot+4de3c0e8a263e1e499bc@syzkaller.appspotmail.com Fixes: 341adeec9ada ("net/smc: Forward wakeup to smc socket waitqueue after fallback") Link: https://lore.kernel.org/r/0000000000006d045e05d78776f6@google.com Signed-off-by: Wen Gu Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 00e367bf91a7..11fabbbe041d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -706,14 +706,17 @@ static void smc_fback_error_report(struct sock *clcsk) static int smc_switch_to_fallback(struct smc_sock *smc) { struct sock *clcsk; + int rc = 0; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); - return -EBADF; + rc = -EBADF; + goto out; } clcsk = smc->clcsock->sk; + if (smc->use_fallback) + goto out; smc->use_fallback = true; if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; @@ -738,8 +741,9 @@ static int smc_switch_to_fallback(struct smc_sock *smc) smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } +out: mutex_unlock(&smc->clcsock_release_lock); - return 0; + return rc; } /* fall back during connect */ -- Gitee From 4440af3a6f0fe185c4b176736af405bcda6195c3 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 27 Nov 2023 19:43:32 +0800 Subject: [PATCH 21/22] net/smc: Only save the original clcsock callback functions mainline inclusion from mainline-v5.18-rc5 commit 97b9af7a70936e331170c79040cc9bf20071b566 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=97b9af7a70936e331170c79040cc9bf20071b566 -------------------------------- Both listen and fallback process will save the current clcsock callback functions and establish new ones. 
But if both of them happen, the saved callback functions will be overwritten. So this patch introduces some helpers to ensure that only the original callback functions of clcsock are saved. Fixes: 341adee ("net/smc: Forward wakeup to smc socket waitqueue after fallback") Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 56 ++++++++++++++++++++++++++++++--------------- net/smc/smc.h | 29 +++++++++++++++++++++++ net/smc/smc_close.c | 3 ++- 3 files changed, 68 insertions(+), 20 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 11fabbbe041d..7ecedbfad707 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -283,6 +283,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); + smc_init_saved_callbacks(smc); return sk; } @@ -703,9 +704,24 @@ static void smc_fback_error_report(struct sock *clcsk) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); } +static void smc_fback_replace_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, + &smc->clcsk_state_change); + smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, + &smc->clcsk_data_ready); + smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, + &smc->clcsk_write_space); + smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, + &smc->clcsk_error_report); +} + static int smc_switch_to_fallback(struct smc_sock *smc) { - struct sock *clcsk; int rc = 0; mutex_lock(&smc->clcsock_release_lock); @@ -713,10 +729,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc) rc = -EBADF; goto out; } - clcsk = smc->clcsock->sk; - if (smc->use_fallback) - goto out; smc->use_fallback = true; if
(smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; @@ -728,18 +741,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc) * in smc sk->sk_wq and they should be woken up * as clcsock's wait queue is woken up. */ - smc->clcsk_state_change = clcsk->sk_state_change; - smc->clcsk_data_ready = clcsk->sk_data_ready; - smc->clcsk_write_space = clcsk->sk_write_space; - smc->clcsk_error_report = clcsk->sk_error_report; - - clcsk->sk_state_change = smc_fback_state_change; - clcsk->sk_data_ready = smc_fback_data_ready; - clcsk->sk_write_space = smc_fback_write_space; - clcsk->sk_error_report = smc_fback_error_report; - - smc->clcsock->sk->sk_user_data = - (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_fback_replace_callbacks(smc); } out: mutex_unlock(&smc->clcsock_release_lock); @@ -1418,6 +1420,19 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; + + /* if new clcsock has also inherited the fallback-specific callback + * functions, switch them back to the original ones. 
+ */ + if (lsmc->use_fallback) { + if (lsmc->clcsk_state_change) + new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; + if (lsmc->clcsk_write_space) + new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; + if (lsmc->clcsk_error_report) + new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; + } + (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -2144,13 +2159,16 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ - smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; - smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, + smc_clcsock_data_ready, &smc->clcsk_data_ready); + rc = kernel_listen(smc->clcsock, backlog); if (rc) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; goto out; } sk->sk_max_ack_backlog = backlog; diff --git a/net/smc/smc.h b/net/smc/smc.h index 3e40ee2d8223..3eafd44e363f 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -276,12 +276,41 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline void smc_init_saved_callbacks(struct smc_sock *smc) +{ + smc->clcsk_state_change = NULL; + smc->clcsk_data_ready = NULL; + smc->clcsk_write_space = NULL; + smc->clcsk_error_report = NULL; +} + static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); } +/* save target_cb in saved_cb, and replace target_cb with new_cb */ +static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), + void (*new_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + /* only save once */ + if 
(!*saved_cb) + *saved_cb = *target_cb; + *target_cb = new_cb; +} + +/* restore target_cb to saved_cb, and reset saved_cb to NULL */ +static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + if (!*saved_cb) + return; + *target_cb = *saved_cb; + *saved_cb = NULL; +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index ea2cf4ff7208..f3b6b27310de 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -215,7 +215,8 @@ int smc_close_active(struct smc_sock *smc) sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } -- Gitee From 694d7a9216b242c86eb2e05fd1e2f99fcf3fcfb0 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 27 Nov 2023 19:57:18 +0800 Subject: [PATCH 22/22] net/smc: Fix slab-out-of-bounds issue in fallback mainline inclusion from mainline-v5.18-rc5 commit 0558226cebee256aa3f8ec0cc5a800a10bf120a6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JHC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=0558226cebee256aa3f8ec0cc5a800a10bf120a6 -------------------------------- syzbot reported a slab-out-of-bounds/use-after-free issue, which was caused by accessing an already freed smc sock in fallback-specific callback functions of clcsock. This patch fixes the issue by restoring fallback-specific callback functions to original ones and resetting clcsock sk_user_data to NULL before freeing smc sock. 
Meanwhile, this patch introduces sk_callback_lock to make the access and assignment to sk_user_data mutually exclusive. Reported-by: syzbot+b425899ed22c6943e00b@syzkaller.appspotmail.com Fixes: 341adee ("net/smc: Forward wakeup to smc socket waitqueue after fallback") Link: https://lore.kernel.org/r/00000000000013ca8105d7ae3ada@google.com/ Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 80 ++++++++++++++++++++++++++++++++------------- net/smc/smc_close.c | 2 ++ 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 7ecedbfad707..96c5976b5cfd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -146,11 +146,27 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_fback_restore_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = NULL; + + smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); + smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); + smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); + smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; + smc_fback_restore_callbacks(smc); } } @@ -666,48 +682,57 @@ static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, static void smc_fback_state_change(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); + read_lock_bh(&clcsk->sk_callback_lock); + smc = 
smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_state_change); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_data_ready(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_data_ready); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_write_space(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_write_space); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_error_report(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_error_report); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_replace_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; + write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, @@ -718,6 +743,8 @@ static void smc_fback_replace_callbacks(struct smc_sock *smc) &smc->clcsk_write_space); smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); } static int 
smc_switch_to_fallback(struct smc_sock *smc) @@ -2115,11 +2142,12 @@ static void smc_tcp_listen_work(struct work_struct *work) static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc = - smc_clcsock_user_data(listen_clcsock); + struct smc_sock *lsmc; + read_lock_bh(&listen_clcsock->sk_callback_lock); + lsmc = smc_clcsock_user_data(listen_clcsock); if (!lsmc) - return; + goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { int idx = atomic_fetch_inc(&lsmc->tcp_listen_work_seq) % @@ -2128,6 +2156,8 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_works[idx].work)) sock_put(&lsmc->sk); } +out: + read_unlock_bh(&listen_clcsock->sk_callback_lock); } static int smc_listen(struct socket *sock, int backlog) @@ -2159,16 +2189,20 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, smc_clcsock_data_ready, &smc->clcsk_data_ready); + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_listen(smc->clcsock, backlog); if (rc) { + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index f3b6b27310de..7ee6d6102dc1 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -215,9 +215,11 @@ int smc_close_active(struct smc_sock *smc) sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { + 
write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } smc_close_cleanup_listen(sk); -- Gitee