From 19ef73e22323e68e8caeefca75779b5e288123fb Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 15:19:52 +0800 Subject: [PATCH 001/148] Revert "net/smc: Keep first contact clcsock" This reverts commit d91580f22d2a89dac0b525d5fc19c21e0d5da15c. --- include/net/netns/smc.h | 1 - net/smc/af_smc.c | 5 +---- net/smc/smc.h | 1 - net/smc/smc_close.c | 12 +----------- net/smc/smc_core.c | 13 ------------- net/smc/smc_core.h | 28 ---------------------------- net/smc/smc_llc.c | 3 --- net/smc/smc_sysctl.c | 9 --------- 8 files changed, 2 insertions(+), 70 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index a31a6390c629..a5c86e4402a6 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -29,7 +29,6 @@ struct netns_smc { int sysctl_autocorking; int sysctl_allow_different_subnet; bool limit_smc_hs; /* constraint on handshake */ - int sysctl_keep_first_contact_clcsock; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 503f7df8d21f..94b94601bd1c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -338,7 +338,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_sndbuf = net->smc.sysctl_wmem_default; sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); - smc->first_contact_local = 0; INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); @@ -2678,7 +2677,7 @@ static int smc_shutdown(struct socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (do_shutdown && smc->clcsock && !smc->first_contact_local) + if (do_shutdown && smc->clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; @@ -3174,7 +3173,6 @@ static __net_init int smc_net_init(struct net *net) net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 0; 
net->smc.sysctl_autocorking = 1; - net->smc.sysctl_keep_first_contact_clcsock = 1; } return smc_pnet_net_init(net); @@ -3322,7 +3320,6 @@ static int __init smc_init(void) init_net.smc.sysctl_tcp2smc = 0; init_net.smc.sysctl_allow_different_subnet = 0; init_net.smc.sysctl_autocorking = 1; - init_net.smc.sysctl_keep_first_contact_clcsock = 1; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc.h b/net/smc/smc.h index f5edc13955b8..770640a0146b 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -249,7 +249,6 @@ struct smc_sock { /* smc sock container */ /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ - bool first_contact_local; struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 34df4b4b64a2..292e4d904ab6 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -23,25 +23,16 @@ /* release the clcsock that is assigned to the smc_sock */ void smc_clcsock_release(struct smc_sock *smc) { - struct smc_link *lnk; struct socket *tcp; if (smc->listen_smc && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); - /* don't release clcsock for eRDMA */ if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; - lnk = smc->conn.lnk; - if (!smc->use_fallback && smc->first_contact_local && - lnk) { - smc_clcsock_put(lnk->clcsock); - goto out; - } sock_release(tcp); } -out: mutex_unlock(&smc->clcsock_release_lock); } @@ -242,8 +233,7 @@ int smc_close_active(struct smc_sock *smc) /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. 
*/ - if (smc->clcsock && smc->clcsock->sk && - !smc->first_contact_local) { + if (smc->clcsock && smc->clcsock->sk) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? rc : rc1; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 539700911315..bdc16e8cdee5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -768,7 +768,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ - smc_clcsock_hold(lnk->clcsock); lnk->link_idx = link_idx; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); @@ -823,7 +822,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); - smc_clcsock_put(lnk->clcsock); smc_lgr_put(lgr); /* lgr_hold above */ return rc; } @@ -916,13 +914,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; smcr_link_iw_extension(&lnk->iw_conn_param, smc->clcsock->sk); - lnk->clcsock = kzalloc(sizeof(*lnk->clcsock), GFP_KERNEL); - if (!lnk->clcsock) { - rc = -ENOMEM; - goto free_wq; - } - lnk->clcsock->sock = smc->clcsock; - refcount_set(&lnk->clcsock->refcnt, 1); rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { @@ -1262,7 +1253,6 @@ static void __smcr_link_clear(struct smc_link *lnk) smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; - smc_clcsock_put(lnk->clcsock); if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ @@ -1934,9 +1924,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) create: if (ini->first_contact_local) { - /* keep this clcsock for QP reuse */ - if (net->smc.sysctl_keep_first_contact_clcsock) - smc->first_contact_local = 1; rc = 
smc_lgr_create(smc, ini); if (rc) goto out; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index a695d5bcab3a..35951baf55f9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -87,32 +87,6 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LINKFLAG_ANNOUNCE_PENDING 0 -struct smc_clcsock { - refcount_t refcnt; - struct socket *sock; -}; - -static inline void smc_clcsock_hold(struct smc_clcsock *clcsock) -{ - if (!clcsock) - return; - - refcount_inc(&clcsock->refcnt); -} - -static inline void smc_clcsock_put(struct smc_clcsock *clcsock) -{ - if (!clcsock) - return; - - if (refcount_dec_and_test(&clcsock->refcnt)) { - if (clcsock->sock) - sock_release(clcsock->sock); - clcsock->sock = NULL; - kfree(clcsock); - } -} - struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -193,8 +167,6 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ - - struct smc_clcsock *clcsock; /* keep for eRDMA */ }; /* For now we just allow one parallel link per link group. 
The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9a5b2880e761..67b8b1595770 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1114,8 +1114,6 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_reject; lnk_new = &lgr->lnk[lnk_idx]; lnk_new->iw_conn_param = link->iw_conn_param; - lnk_new->clcsock = link->clcsock; - rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); if (rc) goto out_reject; @@ -1487,7 +1485,6 @@ int smc_llc_srv_add_link(struct smc_link *link, } lgr->lnk[lnk_idx].iw_conn_param = link->iw_conn_param; - lgr->lnk[lnk_idx].clcsock = link->clcsock; rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); if (rc) goto out; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index b2c3dae5543e..bb0103a1e7b2 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -62,15 +62,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { - .procname = "keep_first_contact_clcsock", - .data = &init_net.smc.sysctl_keep_first_contact_clcsock, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; -- Gitee From dd27251b9e1b3692f28b04c3b362f1d3438177c1 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 15:20:04 +0800 Subject: [PATCH 002/148] Revert "net/smc: Multiple CQs per IB devices" This reverts commit 175d7ce9a1c79fade3708cb4a4edeecaf6c94da3. 
--- net/smc/smc_ib.c | 139 ++++++++++++++++------------------------------- net/smc/smc_ib.h | 6 +- net/smc/smc_wr.c | 18 ++---- 3 files changed, 52 insertions(+), 111 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d33acd85f4c6..cc16377fafa7 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -630,36 +630,6 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, - bool is_send) -{ - struct smc_ib_cq *smcibcq, *cq; - int min, i; - - if (is_send) - smcibcq = smcibdev->smcibcq_send; - else - smcibcq = smcibdev->smcibcq_recv; - - cq = smcibcq; - min = cq->load; - - for (i = 0; i < smcibdev->num_cq_peer; i++) { - if (smcibcq[i].load < min) { - cq = &smcibcq[i]; - min = cq->load; - } - } - - cq->load++; - return cq; -} - -static void smc_ib_put_cq(struct smc_ib_cq *smcibcq) -{ - smcibcq->load--; -} - static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -683,11 +653,8 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - if (lnk->roce_qp) { + if (lnk->roce_qp) ib_destroy_qp(lnk->roce_qp); - smc_ib_put_cq(lnk->smcibcq_send); - smc_ib_put_cq(lnk->smcibcq_recv); - } lnk->roce_qp = NULL; lnk->smcibcq_send = NULL; lnk->smcibcq_recv = NULL; @@ -696,16 +663,12 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, - true); - struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, - false); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = smcibcq_send->ib_cq, - .recv_cq = smcibcq_recv->ib_cq, + .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, + .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -734,8 +697,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; } else { - lnk->smcibcq_send = smcibcq_send; - lnk->smcibcq_recv = smcibcq_recv; + lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; + lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; smc_wr_remember_qp_attr(lnk); } return rc; @@ -856,26 +819,20 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq_peer; i++) { - if (smcibdev->smcibcq_send[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); - - if (smcibdev->smcibcq_recv[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); - } + ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); + kfree(smcibdev->ib_cq_send); + smcibdev->ib_cq_send = NULL; - kfree(smcibdev->smcibcq_send); - kfree(smcibdev->smcibcq_recv); + ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); + kfree(smcibdev->ib_cq_recv); + smcibdev->ib_cq_recv = NULL; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; + struct smc_ib_cq *smcibcq_send, *smcibcq_recv; int cqe_size_order, smc_order; - struct smc_ib_cq *smcibcq; - int i, num_cq_peer; long rc; mutex_lock(&smcibdev->mutex); @@ -887,53 +844,49 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - num_cq_peer = min_t(int, smcibdev->ibdev->num_comp_vectors, - num_online_cpus()); - smcibdev->num_cq_peer = num_cq_peer; - 
smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_send) { + smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); + if (!smcibcq_send) { rc = -ENOMEM; - goto err; + goto out; } - smcibdev->smcibcq_recv = kcalloc(num_cq_peer, sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_recv) { - rc = -ENOMEM; - goto err; + smcibcq_send->smcibdev = smcibdev; + smcibcq_send->is_send = 1; + cqattr.comp_vector = 0; + smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibcq_send, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); + if (IS_ERR(smcibdev->ib_cq_send)) { + smcibdev->ib_cq_send = NULL; + goto out; } + smcibdev->ib_cq_send = smcibcq_send; - /* initialize CQs */ - for (i = 0; i < num_cq_peer; i++) { - /* initialize send CQ */ - smcibcq = &smcibdev->smcibcq_send[i]; - smcibcq->smcibdev = smcibdev; - smcibcq->is_send = 1; - cqattr.comp_vector = i; - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) - goto err; - - /* initialize recv CQ */ - smcibcq = &smcibdev->smcibcq_recv[i]; - smcibcq->smcibdev = smcibdev; - cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) - goto err; + smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); + if (!smcibcq_recv) { + rc = -ENOMEM; + goto err_send; + } + smcibcq_recv->smcibdev = smcibdev; + cqattr.comp_vector = 1; + smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibcq_recv, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); + if (IS_ERR(smcibdev->ib_cq_recv)) { + smcibdev->ib_cq_recv = NULL; + goto err_recv; } + smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); 
smcibdev->initialized = 1; goto out; -err: - smc_ib_cleanup_cq(smcibdev); +err_recv: + kfree(smcibcq_recv); + ib_destroy_cq(smcibcq_send->ib_cq); +err_send: + kfree(smcibcq_send); out: mutex_unlock(&smcibdev->mutex); return rc; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 1af83b5a2e7e..9b24033e20e4 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -37,7 +37,6 @@ struct smc_ib_cq { /* ib_cq wrapper for smc */ struct ib_cq *ib_cq; /* real ib_cq for link */ struct tasklet_struct tasklet; /* tasklet for wr */ bool is_send; /* send for recv cq */ - int load; /* load of current cq */ }; struct smc_ib_device { /* ib-device infos for smc */ @@ -45,9 +44,8 @@ struct smc_ib_device { /* ib-device infos for smc */ struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - int num_cq_peer; /* num of snd/rcv cq peer */ - struct smc_ib_cq *smcibcq_send; /* send cqs */ - struct smc_ib_cq *smcibcq_recv; /* recv cqs */ + struct smc_ib_cq *ib_cq_send; /* send completion queue */ + struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 5c2d30417346..327dd8ee3590 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -842,24 +842,14 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); - tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); - } + tasklet_kill(&smcibdev->ib_cq_recv->tasklet); + tasklet_kill(&smcibdev->ib_cq_send->tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, - smc_wr_tx_tasklet_fn); - 
tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, - smc_wr_rx_tasklet_fn); - } + tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 586a10cebb9ec12acaf1f6c8436a98472691b9b0 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 15:20:13 +0800 Subject: [PATCH 003/148] Revert "net/smc: Introduce smc_ib_cq to bind link and cq" This reverts commit e8caae27be0141c25cd433682ac98c564a6caa1c. --- net/smc/smc_core.h | 2 -- net/smc/smc_ib.c | 86 ++++++++++++++-------------------------------- net/smc/smc_ib.h | 13 +++---- net/smc/smc_wr.c | 32 ++++++++--------- 4 files changed, 45 insertions(+), 88 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 35951baf55f9..5849a98c7f6e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -94,8 +94,6 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ - struct smc_ib_cq *smcibcq_recv; /* cq for recv */ - struct smc_ib_cq *smcibcq_send; /* cq for send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index cc16377fafa7..9d55173d474f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,12 +131,12 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, IB_CQ_SOLICITED_MASK); if (rc) goto out; - rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) goto out; @@ -656,8 +656,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) if (lnk->roce_qp) ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; - lnk->smcibcq_send = NULL; - lnk->smcibcq_recv = NULL; } /* create a 
queue pair within the protection domain for a link */ @@ -667,8 +665,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, - .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, + .send_cq = lnk->smcibdev->roce_cq_send, + .recv_cq = lnk->smcibdev->roce_cq_recv, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -694,13 +692,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); - if (IS_ERR(lnk->roce_qp)) { + if (IS_ERR(lnk->roce_qp)) lnk->roce_qp = NULL; - } else { - lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; - lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; + else smc_wr_remember_qp_attr(lnk); - } return rc; } @@ -817,21 +812,10 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } -static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) -{ - ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); - kfree(smcibdev->ib_cq_send); - smcibdev->ib_cq_send = NULL; - - ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); - kfree(smcibdev->ib_cq_recv); - smcibdev->ib_cq_recv = NULL; -} - long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { - struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; - struct smc_ib_cq *smcibcq_send, *smcibcq_recv; + struct ib_cq_init_attr cqattr = { + .cqe = SMC_MAX_CQE, .comp_vector = 0 }; int cqe_size_order, smc_order; long rc; @@ -844,49 +828,28 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); - if (!smcibcq_send) { - rc = -ENOMEM; - goto out; - } - smcibcq_send->smcibdev = smcibdev; - smcibcq_send->is_send = 1; - 
cqattr.comp_vector = 0; - smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq_send, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); - if (IS_ERR(smcibdev->ib_cq_send)) { - smcibdev->ib_cq_send = NULL; + smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); + if (IS_ERR(smcibdev->roce_cq_send)) { + smcibdev->roce_cq_send = NULL; goto out; } - smcibdev->ib_cq_send = smcibcq_send; - - smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); - if (!smcibcq_recv) { - rc = -ENOMEM; - goto err_send; - } - smcibcq_recv->smcibdev = smcibdev; - cqattr.comp_vector = 1; - smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibcq_recv, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); - if (IS_ERR(smcibdev->ib_cq_recv)) { - smcibdev->ib_cq_recv = NULL; - goto err_recv; + smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); + if (IS_ERR(smcibdev->roce_cq_recv)) { + smcibdev->roce_cq_recv = NULL; + goto err; } - smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; goto out; -err_recv: - kfree(smcibcq_recv); - ib_destroy_cq(smcibcq_send->ib_cq); -err_send: - kfree(smcibcq_send); +err: + ib_destroy_cq(smcibdev->roce_cq_send); out: mutex_unlock(&smcibdev->mutex); return rc; @@ -898,7 +861,8 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) goto out; smcibdev->initialized = 0; - smc_ib_cleanup_cq(smcibdev); + ib_destroy_cq(smcibdev->roce_cq_recv); + ib_destroy_cq(smcibdev->roce_cq_send); smc_wr_remove_dev(smcibdev); out: mutex_unlock(&smcibdev->mutex); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 9b24033e20e4..5d8b49c57f50 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -32,20 +32,15 @@ struct 
smc_ib_devices { /* list of smc ib devices definition */ extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ -struct smc_ib_cq { /* ib_cq wrapper for smc */ - struct smc_ib_device *smcibdev; /* parent ib device */ - struct ib_cq *ib_cq; /* real ib_cq for link */ - struct tasklet_struct tasklet; /* tasklet for wr */ - bool is_send; /* send for recv cq */ -}; - struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - struct smc_ib_cq *ib_cq_send; /* send completion queue */ - struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ + struct ib_cq *roce_cq_send; /* send completion queue */ + struct ib_cq *roce_cq_recv; /* recv completion queue */ + struct tasklet_struct send_tasklet; /* called by send cq handler */ + struct tasklet_struct recv_tasklet; /* called by recv cq handler */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 327dd8ee3590..8384c4306c7d 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -136,7 +136,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); + struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int i = 0, rc; int polled = 0; @@ -145,9 +145,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(smcibcq->ib_cq, + ib_req_notify_cq(dev->roce_cq_send, IB_CQ_NEXT_COMP | 
IB_CQ_REPORT_MISSED_EVENTS); } @@ -162,9 +162,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; - tasklet_schedule(&smcibcq->tasklet); + tasklet_schedule(&dev->send_tasklet); } /*---------------------------- request submission ---------------------------*/ @@ -327,7 +327,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibcq_send->ib_cq, + ib_req_notify_cq(link->smcibdev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { @@ -371,7 +371,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibcq_send->ib_cq, + ib_req_notify_cq(link->smcibdev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; @@ -486,7 +486,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); + struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int polled = 0; int rc; @@ -495,9 +495,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(smcibcq->ib_cq, + ib_req_notify_cq(dev->roce_cq_recv, IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); } @@ -511,9 +511,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void 
*cq_context) { - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; - tasklet_schedule(&smcibcq->tasklet); + tasklet_schedule(&dev->recv_tasklet); } int smc_wr_rx_post_init(struct smc_link *link) @@ -842,14 +842,14 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->ib_cq_recv->tasklet); - tasklet_kill(&smcibdev->ib_cq_send->tasklet); + tasklet_kill(&smcibdev->recv_tasklet); + tasklet_kill(&smcibdev->send_tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 87fe336b7ead6d3b9555dee23f5043c51a086407 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 15:20:24 +0800 Subject: [PATCH 004/148] Revert "anolis: net/smc: Introduce link-related proc file" This reverts commit ae75cf9d2eb7804ea91d51cc469cd47bbc353bca. --- net/smc/smc_proc.c | 58 +++------------------------------------------- net/smc/smc_proc.h | 10 ++++---- 2 files changed, 7 insertions(+), 61 deletions(-) diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c index 106887b7b9e1..19d8cc82a7ac 100644 --- a/net/smc/smc_proc.c +++ b/net/smc/smc_proc.c @@ -154,11 +154,9 @@ static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, smc->conn.tx_cnt, smc->conn.tx_bytes, - smc->conn.tx_corked_cnt, smc->conn.tx_corked_bytes); + lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); } else { - seq_puts(seq, "- - - - - - -" - " - - -\n"); + seq_puts(seq, "- - - - - - - -\n"); } } @@ -172,7 +170,7 @@ static int smc_conn_show(struct seq_file *seq, void *v) seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_P", "tx_B", "cork_P", "cork_B"); + "l_qp", "r_qp", "tx_cnt", "rx_cnt"); goto out; } @@ -236,51 +234,6 @@ static struct smc_proc_entry smc_proc[] = { #endif }; -extern struct smc_lgr_list smc_lgr_list; -static int proc_show_links(struct seq_file *seq, void *v) -{ - struct smc_link_group *lgr, *lg; - struct smc_link *lnk; - int i = 0, j = 0; - - seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s\n", - "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", - "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags"); - - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lnk = &lgr->lnk[i]; - if (!smc_link_usable(lnk)) - continue; - for (j = 0; j < SMC_LGR_ID_SIZE; j++) - seq_printf(seq, "%02X", lgr->id[j]); - seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu\n", - lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, - lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, - lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, - lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, - atomic_read(&lnk->local_rq_credits), - atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, - lnk->peer_cr_watermark_low, lnk->flags); - } - } - spin_unlock_bh(&smc_lgr_list.lock); - return 0; -} - -static int proc_open_links(struct inode *inode, struct file *file) -{ - single_open(file, proc_show_links, NULL); - return 0; -} - -static struct proc_ops link_file_ops = { -.proc_open = proc_open_links, -.proc_read = seq_read, -.proc_release = single_release, -}; - static int __net_init smc_proc_dir_init(struct net *net) { int i, rc = -ENOMEM; @@ -297,9 +250,6 @@ static int __net_init smc_proc_dir_init(struct net *net) goto err_entry; } - if (!proc_create("links", 0444, net->proc_net_smc, &link_file_ops)) - goto err_entry; - return 0; err_entry: @@ -315,8 +265,6 @@ static void __net_exit smc_proc_dir_exit(struct net *net) { int i; - remove_proc_entry("links", net->proc_net_smc); - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) remove_proc_entry(smc_proc[i].name, net->proc_net_smc); diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h index faa5eaaee511..ec59ca03e163 100644 --- a/net/smc/smc_proc.h +++ b/net/smc/smc_proc.h @@ -9,14 +9,12 @@ #include #include "smc.h" -#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ - "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ - "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") +#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") #define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") #define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8llu %-8llu %-8llu %-8llu\n") +#define 
CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") struct smc_proc_private { struct seq_net_private p; -- Gitee From 2f4ca075d48906070088e3daf05ec20179cf79f5 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 15:20:42 +0800 Subject: [PATCH 005/148] Revert "anolis: net/smc: Support rq flow control in smc-r link layer" This reverts commit d1bf84efe845661efe1c4000da01b24e8fef465a. --- net/smc/af_smc.c | 12 ------ net/smc/smc_cdc.c | 12 +----- net/smc/smc_cdc.h | 3 +- net/smc/smc_clc.c | 3 -- net/smc/smc_clc.h | 3 +- net/smc/smc_core.h | 17 +-------- net/smc/smc_ib.c | 6 +-- net/smc/smc_llc.c | 92 +--------------------------------------------- net/smc/smc_llc.h | 5 --- net/smc/smc_wr.c | 31 +++------------- net/smc/smc_wr.h | 54 +-------------------------- 11 files changed, 15 insertions(+), 223 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 94b94601bd1c..314cb7a4b090 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -623,13 +623,6 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; - link->credits_enable = clc->r0.init_credits ? 1 : 0; - if (link->credits_enable) { - atomic_set(&link->peer_rq_credits, clc->r0.init_credits); - // set peer rq credits watermark, if less than init_credits * 2/3, - // then credit announcement is needed. 
- link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); - } } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, @@ -1173,11 +1166,6 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { - if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { - reason_code = SMC_CLC_DECL_CREDITSERR; - goto connect_abort; - } - if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGRMB; goto connect_abort; diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 7727a8fdca0f..fe72e416d926 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -106,30 +106,25 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; - struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; - u8 saved_credits = 0; int rc; smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc(cdc_msg, conn, &cfed); - saved_credits = (u8)smc_wr_rx_get_credits(link); - cdc_msg->credits = saved_credits; + smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (likely(!rc)) { + if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_wr_rx_put_credits(link, saved_credits); atomic_dec(&conn->cdc_pend_tx_wr); } @@ -441,9 +436,6 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ - if (cdc->credits) - smc_wr_tx_put_credits(link, cdc->credits, true); - /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h 
index 145ce7997e64..696cc11f2303 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,8 +47,7 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 credits; /* credits synced by every cdc msg */ - u8 reserved[17]; + u8 reserved[18]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3180c8500c5f..ce27399b38b1 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1038,12 +1038,9 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; - clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); - clc->r0.init_credits = - link->credits_enable ? (u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_short; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index eb4bba54d6df..83f02f131fc0 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -63,7 +63,6 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ -#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -191,7 +190,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved; __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5849a98c7f6e..35a85ec08919 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -21,12 +21,7 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 32 /* max. 
# of RMBs per link group. Correspondingly, - * SMC_WR_BUF_CNT should not be less than 2 * - * SMC_RMBS_PER_LGR_MAX, since every connection at - * least has two rq/sq credits in average, otherwise - * may result in waiting for credits in sending process. - */ +#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -85,8 +80,6 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 -#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 - struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -131,14 +124,6 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ - atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ - atomic_t local_rq_credits; /* credits for local rq flowctrl */ - u8 credits_enable; /* credits enable flag, set when negotiation */ - u8 local_cr_watermark_high; /* local rq credits watermark */ - u8 peer_cr_watermark_low; /* peer rq credits watermark */ - struct work_struct credits_announce_work; /* work for credits announcement */ - unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ - u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9d55173d474f..8e2b1af1d291 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -670,12 +670,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND. - * RDMA_WRITE consumes send queue entities, - * without recv queue entities. + * there are max. 
2 RDMA_WRITE per 1 WR_SEND */ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT, + .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, }, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 67b8b1595770..1d8dafa1a35e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -75,8 +75,7 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 init_credits; /* QP rq init credits for rq flowctrl */ - u8 reserved[7]; + u8 reserved[8]; }; struct smc_llc_msg_add_link_cont_rt { @@ -171,12 +170,6 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; -struct smc_llc_msg_announce_credits { /* type 0x0A */ - struct smc_llc_hdr hd; - u8 credits; - u8 reserved[39]; -}; - struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ struct smc_llc_hdr hd; u8 num_rkeys; @@ -196,7 +189,6 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; - struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -756,46 +748,6 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -/* send credits announce request or response */ -int smc_llc_announce_credits(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool force) -{ - struct smc_llc_msg_announce_credits *announce_credits; - struct smc_wr_tx_pend_priv *pend; - struct smc_wr_buf *wr_buf; - int rc; - u8 saved_credits = 0; - - if (!link->credits_enable || - (!force && !smc_wr_rx_credits_need_announce(link))) - return 0; - - saved_credits = (u8)smc_wr_rx_get_credits(link); - if (!saved_credits) - /* maybe synced by cdc msg */ - return 0; - - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); - if (rc) { - smc_wr_rx_put_credits(link, saved_credits); - return rc; - } - - announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; - memset(announce_credits, 0, 
sizeof(*announce_credits)); - announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; - announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); - if (reqresp == SMC_LLC_RESP) - announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; - announce_credits->credits = saved_credits; - /* send llc message */ - rc = smc_wr_tx_send(link, pend); - if (rc) - smc_wr_rx_put_credits(link, saved_credits); - - return rc; -} - /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -1058,13 +1010,6 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; - link->credits_enable = add_llc->init_credits ? 1 : 0; - if (link->credits_enable) { - atomic_set(&link->peer_rq_credits, add_llc->init_credits); - // set peer rq credits watermark, if less than init_credits * 2/3, - // then credit announcement is needed. 
- link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); - } } /* as an SMC client, process an add link request */ @@ -1985,10 +1930,6 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; - case SMC_LLC_ANNOUNCE_CREDITS: - if (smc_link_active(link)) - smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); - break; case SMC_LLC_REQ_ADD_LINK: /* handle response here, smc_llc_flow_stop() cannot be called * in tasklet context @@ -2074,10 +2015,6 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; - case SMC_LLC_ANNOUNCE_CREDITS: - if (smc_link_active(link)) - smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); - break; default: smc_llc_protocol_violation(link->lgr, qentry->msg.raw.hdr.common.type); @@ -2171,27 +2108,6 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } -static void smc_llc_announce_credits_work(struct work_struct *work) -{ - struct smc_link *link = container_of(work, - struct smc_link, credits_announce_work); - int rc, retry = 0, agains = 0; - -again: - do { - rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); - } while ((rc == -EBUSY) && smc_link_sendable(link) && - (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); - - if (smc_wr_rx_credits_need_announce(link) && - smc_link_sendable(link) && agains <= 5 && !rc) { - agains++; - goto again; - } - - clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); -} - void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -2227,7 +2143,6 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); - INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; 
} @@ -2259,7 +2174,6 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); - cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -2374,10 +2288,6 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, - { - .handler = smc_llc_rx_handler, - .type = SMC_LLC_ANNOUNCE_CREDITS - }, /* V2 types */ { .handler = smc_llc_rx_handler, diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index f8a14643faf4..4404e52b3346 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -20,8 +20,6 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) -#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY (1) - enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -37,7 +35,6 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, - SMC_LLC_ANNOUNCE_CREDITS = 0X0A, /* V2 types */ SMC_LLC_CONFIRM_LINK_V2 = 0x21, SMC_LLC_ADD_LINK_V2 = 0x22, @@ -89,8 +86,6 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); -int smc_llc_announce_credits(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 8384c4306c7d..ca179e2c86b7 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -130,8 +130,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - if (wq_has_sleeper(&link->wr_tx_wait)) 
- wake_up(&link->wr_tx_wait); + wake_up(&link->wr_tx_wait); } static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) @@ -174,16 +173,11 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; - - if (!smc_wr_tx_get_credit(link)) - return -EBUSY; - for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; - smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -289,7 +283,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - smc_wr_tx_put_credits(link, 1, true); + wake_up(&link->wr_tx_wait); return 1; } else if (link->lgr->smc_version == SMC_V2 && pend->idx == link->wr_tx_cnt) { @@ -475,12 +469,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) break; } } - - if (smc_wr_rx_credits_need_announce(link) && - !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { - set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); - schedule_work(&link->credits_announce_work); - } } } @@ -523,8 +511,6 @@ int smc_wr_rx_post_init(struct smc_link *link) for (i = 0; i < link->wr_rx_cnt; i++) rc = smc_wr_rx_post(link); - // credits have already been announced to peer - atomic_set(&link->local_rq_credits, 0); return rc; } @@ -559,7 +545,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, lnk->qp_attr.cap.max_recv_wr); } @@ -748,7 +734,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = 
kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -756,7 +742,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -775,7 +761,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) @@ -898,11 +884,6 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); - atomic_set(&lnk->peer_rq_credits, 0); - atomic_set(&lnk->local_rq_credits, 0); - lnk->flags = 0; - lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); - lnk->peer_cr_watermark_low = 0; return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 8cf276215c91..a54e90a1110f 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,12 +19,7 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT - * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, - * since every connection at least has two rq/sq - * credits in average, otherwise may result in - * waiting for credits in sending process. 
- */ +#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -88,51 +83,6 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } -// get one tx credit, and peer rq credits dec -static inline int smc_wr_tx_get_credit(struct smc_link *link) -{ - return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; -} - -// put tx credits, when some failures occurred after tx credits got -// or receive announce credits msgs -static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) -{ - if (link->credits_enable && credits) { - atomic_add(credits, &link->peer_rq_credits); - if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) - wake_up_nr(&link->wr_tx_wait, credits); - } -} - -// to check whether peer rq credits is lower than watermark. -static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) -{ - return link->credits_enable && - atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; -} - -// get local rq credits and set credits to zero. -// may called when announcing credits -static inline int smc_wr_rx_get_credits(struct smc_link *link) -{ - return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; -} - -// called when post_recv a rqe -static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) -{ - if (link->credits_enable && credits) - atomic_add(credits, &link->local_rq_credits); -} - -// to check whether local rq credits is higher than watermark. 
-static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) -{ - return link->credits_enable && - atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; -} - /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -145,8 +95,6 @@ static inline int smc_wr_rx_post(struct smc_link *link) index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); - if (!rc) - smc_wr_rx_put_credits(link, 1); return rc; } -- Gitee From 530bb6a7912fccb2282dc0b9e6c166b9f5dc3b28 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 15:20:58 +0800 Subject: [PATCH 006/148] Revert "anolis: net/smc: support auto-cork with nagle algorithm" This reverts commit 2418510c088f5eeb4188d6ddd7a540867ed739ca. --- net/smc/af_smc.c | 2 -- net/smc/smc.h | 2 -- net/smc/smc_cdc.c | 11 ++---- net/smc/smc_sysctl.c | 9 ----- net/smc/smc_tx.c | 83 ++++---------------------------------------- 5 files changed, 9 insertions(+), 98 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 314cb7a4b090..bad49e469eac 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3160,7 +3160,6 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 0; - net->smc.sysctl_autocorking = 1; } return smc_pnet_net_init(net); @@ -3307,7 +3306,6 @@ static int __init smc_init(void) init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; init_net.smc.sysctl_allow_different_subnet = 0; - init_net.smc.sysctl_autocorking = 1; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc.h b/net/smc/smc.h index 770640a0146b..3364f055f042 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,8 +188,6 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* 
wakeup on no cdc_pend_tx_wr*/ - atomic_t tx_pushing; /* nr_threads trying tx push */ - struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index fe72e416d926..9d5a97168969 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -48,14 +48,9 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { - /* If this is the last pending WR complete, push them to prevent - * no one trying to push when corked. - */ - smc_tx_sndbuf_nonempty(conn); - if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) - wake_up(&conn->cdc_pend_tx_wq); - } + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && + unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); smc_tx_sndbuf_nonfull(smc); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index bb0103a1e7b2..583a9457b47f 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -53,15 +53,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { - .procname = "autocorking", - .data = &init_net.smc.sysctl_autocorking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index dac3f9634fd4..927602df80a0 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -124,39 +124,6 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) return rc; } -/* Strategy: Nagle algorithm - * 1. The first message should never cork - * 2. If we have any inflight messages, wait for the first - * message back - * 3. 
The total corked message should not exceed min(64k, sendbuf/2) - */ -static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) -{ - struct smc_connection *conn = &smc->conn; - int prepared_send; - - /* First request && no more message should always pass */ - if (atomic_read(&conn->cdc_pend_tx_wr) == 0 && - !(msg->msg_flags & MSG_MORE)) - return false; - - /* If We have enough data in the send queue that have not been - * pushed, send immediately. - * Note, here we only care about the prepared_sends, but not - * sendbuf_space because sendbuf_space has nothing to do with - * corked data size. - */ - prepared_send = smc_tx_prepared_sends(conn); - if (prepared_send > min(64 * 1024, conn->sndbuf_desc->len >> 1)) - return false; - - if (!sock_net(&smc->sk)->smc.sysctl_autocorking) - return false; - - /* All the other conditions should cork */ - return true; -} - static bool smc_tx_is_corked(struct smc_sock *smc) { struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); @@ -210,13 +177,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; - /* If our send queue is full but peer have RMBE space, - * we should send them out before wait - */ - if (!atomic_read(&conn->sndbuf_space) && - atomic_read(&conn->peer_rmbe_space) > 0) - smc_tx_sndbuf_nonempty(conn); - if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { rc = smc_tx_wait(smc, msg->msg_flags); if (rc) { @@ -276,10 +236,9 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if (((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && - (atomic_read(&conn->sndbuf_space))) || - smc_tx_should_cork(smc, msg)) { + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + (atomic_read(&conn->sndbuf_space))) { /* 
for a corked socket defer the RDMA writes if * sndbuf_space is still available. The applications * should known how/when to uncork it. @@ -640,31 +599,11 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc = 0; - struct smc_sock *smc = container_of(conn, struct smc_sock, conn); - - /* Only let one to push to prevent wasting of CPU and CDC slot */ - if (atomic_inc_return(&conn->tx_pushing) > 1) - return 0; - -again: - atomic_set(&conn->tx_pushing, 1); - - /* No data in the send queue */ - if (unlikely(smc_tx_prepared_sends(conn) <= 0)) - goto out; - - /* Peer don't have RMBE space */ - if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { - SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); - goto out; - } + int rc; if (conn->killed || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { - rc = -EPIPE; /* connection being aborted */ - goto out; - } + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -676,16 +615,6 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) conn); smc_close_wake_tx_prepared(smc); } - -out: - /* We need to check whether someone else have added some data into - * the send queue and tried to push but failed when we are pushing. - * If so, we need to try push again to prevent those data in the - * send queue may never been pushed out - */ - if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) - goto again; - return rc; } -- Gitee From c34348eab08ca0e62123f86c4645e8abcc9d12ae Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:39:02 +0800 Subject: [PATCH 007/148] Revert "net/smc: Cork when sendpage with MSG_SENDPAGE_NOTLAST flag" This reverts commit 56fd35a4e3c357ecb9867b8f3b89c8be2b0f59af. 
--- net/smc/af_smc.c | 4 +--- net/smc/smc_tx.c | 19 +------------------ net/smc/smc_tx.h | 2 -- 3 files changed, 2 insertions(+), 23 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bad49e469eac..3db18f0ebb7a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2948,10 +2948,8 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); } else { - lock_sock(sk); - rc = smc_tx_sendpage(smc, page, offset, size, flags); - release_sock(sk); SMC_STAT_INC(smc, sendpage_cnt); + rc = sock_no_sendpage(sock, page, offset, size, flags); } out: diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 927602df80a0..e223f494e1cc 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -236,8 +236,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && (atomic_read(&conn->sndbuf_space))) { /* for a corked socket defer the RDMA writes if * sndbuf_space is still available. 
The applications @@ -265,22 +264,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return rc; } -int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, - size_t size, int flags) -{ - struct msghdr msg = {.msg_flags = flags}; - char *kaddr = kmap(page); - struct kvec iov; - int rc; - - iov.iov_base = kaddr + offset; - iov.iov_len = size; - iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size); - rc = smc_tx_sendmsg(smc, &msg, size); - kunmap(page); - return rc; -} - /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 34b578498b1f..a59f370b8b43 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -31,8 +31,6 @@ void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); -int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, - size_t size, int flags); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); -- Gitee From d65a4c57380082e1b08777cd4483dfb165848f2b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:57:02 +0800 Subject: [PATCH 008/148] Revert "net/smc: Remove corked dealyed work" This reverts commit 2381e8429e19b210da09274efdc17d414f23ee09. 
--- net/smc/smc_tx.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index e223f494e1cc..d18717879d6a 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -31,6 +31,7 @@ #include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 +#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -237,20 +238,21 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && - (atomic_read(&conn->sndbuf_space))) { - /* for a corked socket defer the RDMA writes if - * sndbuf_space is still available. The applications - * should known how/when to uncork it. + (atomic_read(&conn->sndbuf_space) > + (conn->sndbuf_desc->len >> 1))) { + /* for a corked socket defer the RDMA writes if there + * is still sufficient sndbuf_space available */ conn->tx_corked_bytes += copylen; ++conn->tx_corked_cnt; - continue; + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, + SMC_TX_CORK_DELAY); + } else { + conn->tx_bytes += copylen; + ++conn->tx_cnt; + smc_tx_sndbuf_nonempty(conn); } - conn->tx_bytes += copylen; - ++conn->tx_cnt; - smc_tx_sndbuf_nonempty(conn); - trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From 79aaf5bae2576467220b8d90c1d99f39224ad8ce Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:57:37 +0800 Subject: [PATCH 009/148] Revert "net/smc: Send directly when TCP_CORK is cleared" This reverts commit 772cce3cd71cfe80c9772aba50d6c9d89fee65c5. 
--- net/smc/af_smc.c | 4 ++-- net/smc/smc_tx.c | 25 ++++++++++--------------- net/smc/smc_tx.h | 1 - 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3db18f0ebb7a..27f9becf2a0e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2806,8 +2806,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_CLOSED) { if (!val) { SMC_STAT_INC(smc, cork_cnt); - smc_tx_pending(&smc->conn); - cancel_delayed_work(&smc->conn.tx_work); + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); } } break; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index d18717879d6a..82735741bc2a 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -603,20 +603,6 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -void smc_tx_pending(struct smc_connection *conn) -{ - struct smc_sock *smc = container_of(conn, struct smc_sock, conn); - int rc; - - if (smc->sk.sk_err) - return; - - rc = smc_tx_sndbuf_nonempty(conn); - if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && - !atomic_read(&conn->bytes_to_rcv)) - conn->local_rx_ctrl.prod_flags.write_blocked = 0; -} - /* Wakeup sndbuf consumers from process context * since there is more data to transmit */ @@ -626,9 +612,18 @@ void smc_tx_work(struct work_struct *work) struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc; lock_sock(&smc->sk); - smc_tx_pending(conn); + if (smc->sk.sk_err) + goto out; + + rc = smc_tx_sndbuf_nonempty(conn); + if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && + !atomic_read(&conn->bytes_to_rcv)) + conn->local_rx_ctrl.prod_flags.write_blocked = 0; + +out: release_sock(&smc->sk); } diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index a59f370b8b43..07e6ad76224a 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -27,7 +27,6 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) return 
smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } -void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); -- Gitee From 13c85e5600eb8347e4bd73d5c0d30a55af0f02ed Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:57:47 +0800 Subject: [PATCH 010/148] Revert "Revert "anolis: net/smc: support auto-cork with nagle algorithm"" This reverts commit 37b90e40a7f558b4ebe0a73609b38f2db0add0d3. --- net/smc/af_smc.c | 24 ++----------- net/smc/smc.h | 2 ++ net/smc/smc_cdc.c | 11 ++++-- net/smc/smc_sysctl.c | 9 +++++ net/smc/smc_tx.c | 86 +++++++++++++++++++++++++++++++++++++------- 5 files changed, 94 insertions(+), 38 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 27f9becf2a0e..f26fc7df6b9d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2789,28 +2789,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, rc = -EINVAL; } break; - case TCP_NODELAY: - if (sk->sk_state != SMC_INIT && - sk->sk_state != SMC_LISTEN && - sk->sk_state != SMC_CLOSED) { - if (val) { - SMC_STAT_INC(smc, ndly_cnt); - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); - } - } - break; - case TCP_CORK: - if (sk->sk_state != SMC_INIT && - sk->sk_state != SMC_LISTEN && - sk->sk_state != SMC_CLOSED) { - if (!val) { - SMC_STAT_INC(smc, cork_cnt); - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); - } - } - break; case TCP_DEFER_ACCEPT: smc->sockopt_defer_accept = val; break; @@ -3158,6 +3136,7 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 0; + net->smc.sysctl_autocorking = 1; } return smc_pnet_net_init(net); @@ -3304,6 +3283,7 @@ static int __init smc_init(void) init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; 
init_net.smc.sysctl_allow_different_subnet = 0; + init_net.smc.sysctl_autocorking = 1; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc.h b/net/smc/smc.h index 3364f055f042..770640a0146b 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,6 +188,8 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ + atomic_t tx_pushing; /* nr_threads trying tx push */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 9d5a97168969..fe72e416d926 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -48,9 +48,14 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && - unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) - wake_up(&conn->cdc_pend_tx_wq); + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { + /* If this is the last pending WR complete, push them to prevent + * no one trying to push when corked. 
+ */ + smc_tx_sndbuf_nonempty(conn); + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); + } WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); smc_tx_sndbuf_nonfull(smc); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 583a9457b47f..3f06b3986a99 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -44,6 +44,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "autocorking", + .data = &init_net.smc.sysctl_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "limit_handshake", .data = &init_net.smc.limit_smc_hs, diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 82735741bc2a..68d62ac63dec 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -31,7 +31,6 @@ #include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 -#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -125,11 +124,37 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) return rc; } -static bool smc_tx_is_corked(struct smc_sock *smc) +/* Strategy: Nagle algorithm + * 1. The first message should never cork + * 2. If we have any inflight messages, wait for the first + * message back + * 3. The total corked message should not exceed min(64k, sendbuf/2) + */ +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) { - struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); - - return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; + struct smc_connection *conn = &smc->conn; + int prepared_send; + + /* First request && no more message should always pass */ + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 && + !(msg->msg_flags & MSG_MORE)) + return false; + + /* If We have enough data in the send queue that have not been + * pushed, send immediately. 
+ * Note, here we only care about the prepared_sends, but not + * sendbuf_space because sendbuf_space has nothing to do with + * corked data size. + */ + prepared_send = smc_tx_prepared_sends(conn); + if (prepared_send > min(64 * 1024, conn->sndbuf_desc->len >> 1)) + return false; + + if (!sock_net(&smc->sk)->smc.sysctl_autocorking) + return false; + + /* All the other conditions should cork */ + return true; } /* sndbuf producer: main API called by socket layer. @@ -178,6 +203,13 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; + /* If our send queue is full but peer have RMBE space, + * we should send them out before wait + */ + if (!atomic_read(&conn->sndbuf_space) && + atomic_read(&conn->peer_rmbe_space) > 0) + smc_tx_sndbuf_nonempty(conn); + if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { rc = smc_tx_wait(smc, msg->msg_flags); if (rc) { @@ -237,19 +269,17 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && - (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_desc->len >> 1))) { + if (smc_tx_should_cork(smc, msg)) { /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ conn->tx_corked_bytes += copylen; ++conn->tx_corked_cnt; - queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, - SMC_TX_CORK_DELAY); } else { conn->tx_bytes += copylen; ++conn->tx_cnt; + if (delayed_work_pending(&conn->tx_work)) + cancel_delayed_work(&conn->tx_work); smc_tx_sndbuf_nonempty(conn); } @@ -584,11 +614,31 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc; + int rc = 0; + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + /* Only let one to push to 
prevent wasting of CPU and CDC slot */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + + /* No data in the send queue */ + if (unlikely(smc_tx_prepared_sends(conn) <= 0)) + goto out; + + /* Peer don't have RMBE space */ + if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); + goto out; + } if (conn->killed || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) - return -EPIPE; /* connection being aborted */ + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -EPIPE; /* connection being aborted */ + goto out; + } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -600,6 +650,16 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) conn); smc_close_wake_tx_prepared(smc); } + +out: + /* We need to check whether someone else have added some data into + * the send queue and tried to push but failed when we are pushing. + * If so, we need to try push again to prevent those data in the + * send queue may never been pushed out + */ + if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) + goto again; + return rc; } -- Gitee From faad0927c595bde80340e2d38824b011f521e919 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:57:57 +0800 Subject: [PATCH 011/148] Revert "net/smc: Add sysctl conrtol for handshake limiation" This reverts commit 2c860e4c4e7cb0becf0ec419921aa8878b4b0a80. 
--- net/smc/smc_sysctl.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 3f06b3986a99..7f4e0912dd97 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -53,15 +53,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { - .procname = "limit_handshake", - .data = &init_net.smc.limit_smc_hs, - .maxlen = sizeof(bool), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; -- Gitee From 30ae1bf3d6ad3f8e1bb01f8a4492b4ae86aceb21 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:58:04 +0800 Subject: [PATCH 012/148] Revert "net/smc: Avoid overwriting the copies of clcsock callback functions" This reverts commit c84450c8d9742f11167c29bdeacd426de2bee4c5. --- net/smc/af_smc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f26fc7df6b9d..d1aa4cbb6c75 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -759,17 +759,14 @@ static void smc_fback_error_report(struct sock *clcsk) static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { struct sock *clcsk; - int rc = 0; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { - rc = -EBADF; - goto out; + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; } clcsk = smc->clcsock->sk; - if (smc->use_fallback) - goto out; smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -797,9 +794,8 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } -out: mutex_unlock(&smc->clcsock_release_lock); - return rc; + return 0; } /* fall back during connect */ -- Gitee From 5336cb3c98b0f41c1cd4e789882178765d2b6094 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:58:12 +0800 Subject: [PATCH 013/148] Revert "net/smc: 
Add global configure for handshake limitation by netlink" This reverts commit e0020b5b4c3ddc2eabcada51bde020ad6ae5acdd. --- include/net/netns/smc.h | 1 - include/uapi/linux/smc.h | 11 ----------- net/smc/af_smc.c | 42 ---------------------------------------- net/smc/smc.h | 6 ------ net/smc/smc_netlink.c | 15 -------------- net/smc/smc_pnet.c | 3 --- 6 files changed, 78 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index a5c86e4402a6..c531cb2aac8b 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -28,7 +28,6 @@ struct netns_smc { int sysctl_tcp2smc; int sysctl_autocorking; int sysctl_allow_different_subnet; - bool limit_smc_hs; /* constraint on handshake */ }; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 41a446b379a0..496b1a603642 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -62,9 +62,6 @@ enum { SMC_NETLINK_ADD_TCP2SMC_WLIST, SMC_NETLINK_DEL_TCP2SMC_WLIST, SMC_NETLINK_GET_TCP2SMC_WLIST, - SMC_NETLINK_DUMP_HS_LIMITATION, - SMC_NETLINK_ENABLE_HS_LIMITATION, - SMC_NETLINK_DISABLE_HS_LIMITATION, }; /* SMC_GENL_FAMILY top level attributes */ @@ -289,14 +286,6 @@ enum { SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 }; -/* SMC_NETLINK_HS_LIMITATION attributes */ -enum { - SMC_NLA_HS_LIMITATION_UNSPEC, - SMC_NLA_HS_LIMITATION_ENABLED, /* u8 */ - __SMC_NLA_HS_LIMITATION_MAX, - SMC_NLA_HS_LIMITATION_MAX = __SMC_NLA_HS_LIMITATION_MAX - 1 -}; - /* SMC socket options */ #define SMC_LIMIT_HS 1 /* constraint on smc handshake */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d1aa4cbb6c75..8720920e48b9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -69,45 +69,6 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); -int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct smc_nl_dmp_ctx *cb_ctx = 
smc_nl_dmp_ctx(cb); - void *hdr; - - if (cb_ctx->pos[0]) - goto out; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &smc_gen_nl_family, NLM_F_MULTI, - SMC_NETLINK_DUMP_HS_LIMITATION); - if (!hdr) - return -ENOMEM; - - if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, - sock_net(skb->sk)->smc.limit_smc_hs)) - goto err; - - genlmsg_end(skb, hdr); - cb_ctx->pos[0] = 1; -out: - return skb->len; -err: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) -{ - sock_net(skb->sk)->smc.limit_smc_hs = true; - return 0; -} - -int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) -{ - sock_net(skb->sk)->smc.limit_smc_hs = false; - return 0; -} - static void smc_set_keepalive(struct sock *sk, int val) { struct smc_sock *smc = smc_sk(sk); @@ -3032,9 +2993,6 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; - /* default behavior from limit_smc_hs in every net namespace */ - smc->limit_smc_hs = net->smc.limit_smc_hs; - rc = 0; if (!clcsock) { rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, diff --git a/net/smc/smc.h b/net/smc/smc.h index 770640a0146b..cbf24e64b736 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -14,7 +14,6 @@ #include #include #include /* __aligned */ -#include #include #include "smc_ib.h" @@ -346,9 +345,4 @@ int smc_sysctl_init(void); void smc_sysctl_exit(void); #endif -/* smc handshake limitation interface for netlink */ -int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); -int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); -int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); - #endif /* __SMC_H */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index e1c7ca925cee..f2007aa124cf 100644 --- a/net/smc/smc_netlink.c 
+++ b/net/smc/smc_netlink.c @@ -127,21 +127,6 @@ static const struct genl_ops smc_gen_nl_ops[] = { /* can be retrieved by unprivileged users */ .dumpit = smc_nl_get_tcp2smc_wlist, }, - { - .cmd = SMC_NETLINK_DUMP_HS_LIMITATION, - /* can be retrieved by unprivileged users */ - .dumpit = smc_nl_dump_hs_limitation, - }, - { - .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION, - .flags = GENL_ADMIN_PERM, - .doit = smc_nl_enable_hs_limitation, - }, - { - .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION, - .flags = GENL_ADMIN_PERM, - .doit = smc_nl_disable_hs_limitation, - }, }; static const struct nla_policy smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 4ded23241c20..13df00306182 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -867,9 +867,6 @@ int smc_pnet_net_init(struct net *net) smc_pnet_create_pnetids_list(net); - /* disable handshake limitation by default */ - net->smc.limit_smc_hs = 0; - return 0; } -- Gitee From 840c51e6c94496fc9a8758bf04bc396d0801a84e Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:58:20 +0800 Subject: [PATCH 014/148] Revert "net/smc: Dynamic control handshake limitation by socket options" This reverts commit ca80ec99b4a0965a638a94ffbae5e3a05398bd8d. 
--- include/linux/socket.h | 2 -- include/uapi/linux/smc.h | 4 --- net/smc/af_smc.c | 69 +--------------------------------------- net/smc/smc.h | 1 - 4 files changed, 1 insertion(+), 75 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index fee0fdcd63c2..9aa530d497da 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -361,8 +361,6 @@ struct ucred { #define SOL_TLS 282 #define SOL_XDP 283 -#define SOL_SMC 286 - /* IPX options */ #define IPX_TYPE 1 diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 496b1a603642..b69bd17f6a52 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -285,8 +285,4 @@ enum { __SMC_NLA_SEID_TABLE_MAX, SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 }; - -/* SMC socket options */ -#define SMC_LIMIT_HS 1 /* constraint on smc handshake */ - #endif /* _UAPI_LINUX_SMC_H */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8720920e48b9..ec9d8e11db60 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2336,8 +2336,7 @@ static int smc_listen(struct socket *sock, int backlog) inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; - if (smc->limit_smc_hs) - tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; rc = kernel_listen(smc->clcsock, backlog); if (rc) { @@ -2632,67 +2631,6 @@ static int smc_shutdown(struct socket *sock, int how) return rc ? 
rc : rc1; } -static int __smc_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct smc_sock *smc; - int val, len; - - smc = smc_sk(sock->sk); - - if (get_user(len, optlen)) - return -EFAULT; - - len = min_t(int, len, sizeof(int)); - - if (len < 0) - return -EINVAL; - - switch (optname) { - case SMC_LIMIT_HS: - val = smc->limit_smc_hs; - break; - default: - return -EOPNOTSUPP; - } - - if (put_user(len, optlen)) - return -EFAULT; - if (copy_to_user(optval, &val, len)) - return -EFAULT; - - return 0; -} - -static int __smc_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - struct smc_sock *smc; - int val, rc; - - smc = smc_sk(sk); - - lock_sock(sk); - switch (optname) { - case SMC_LIMIT_HS: - if (optlen < sizeof(int)) - return -EINVAL; - if (copy_from_sockptr(&val, optval, sizeof(int))) - return -EFAULT; - - smc->limit_smc_hs = !!val; - rc = 0; - break; - default: - rc = -EOPNOTSUPP; - break; - } - release_sock(sk); - - return rc; -} - static int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -2702,8 +2640,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (level == SOL_TCP && optname == TCP_ULP) return -EOPNOTSUPP; - else if (level == SOL_SMC) - return __smc_setsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sk); @@ -2764,9 +2700,6 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, struct smc_sock *smc; int rc; - if (level == SOL_SMC) - return __smc_getsockopt(sock, level, optname, optval, optlen); - smc = smc_sk(sock->sk); mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { diff --git a/net/smc/smc.h b/net/smc/smc.h index cbf24e64b736..440231b8adc3 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -253,7 +253,6 @@ struct smc_sock { /* smc sock container */ struct work_struct smc_listen_work;/* prepare 
new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ - bool limit_smc_hs; /* put constraint on handshake */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ -- Gitee From 20c1ff83e0209073e397e85f1bb158cf007ee0a5 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:58:28 +0800 Subject: [PATCH 015/148] Revert "net/smc: Limit SMC visits when handshake workqueue congested" This reverts commit 5e65028ef5fb62e412665ac5f2c3bca6446b3e08. --- include/linux/tcp.h | 1 - net/ipv4/tcp_input.c | 3 +-- net/smc/af_smc.c | 17 ----------------- 3 files changed, 1 insertion(+), 20 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 66177c5e27c9..2f87377e9af7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -394,7 +394,6 @@ struct tcp_sock { bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) - bool (*smc_hs_congested)(const struct sock *sk); bool syn_smc; /* SYN includes SMC */ #endif diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6c7a8d0bf4fe..b71bdda39991 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6645,8 +6645,7 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) - ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && - tcp_sk(sk)->smc_hs_congested(sk)); + ireq->smc_ok = rx_opt->smc_ok; #endif } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ec9d8e11db60..18c9c40628cf 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -106,21 +106,6 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, return NULL; } -static bool smc_hs_congested(const struct sock *sk) -{ - const struct smc_sock *smc; - - smc = smc_clcsock_user_data(sk); - - if (!smc) - return true; - - if 
(workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) - return true; - - return false; -} - static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; @@ -2336,8 +2321,6 @@ static int smc_listen(struct socket *sock, int backlog) inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; - tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; - rc = kernel_listen(smc->clcsock, backlog); if (rc) { smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; -- Gitee From ed30f275adf512c99a74ec810013a190f64dc697 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:58:36 +0800 Subject: [PATCH 016/148] Revert "net/smc: Limit backlog connections" This reverts commit f2622fba3fa63100eaae100b2cacbe0604d739b0. --- net/smc/af_smc.c | 45 --------------------------------------------- net/smc/smc.h | 6 +----- 2 files changed, 1 insertion(+), 50 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 18c9c40628cf..b324a5ee2089 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -76,36 +76,6 @@ static void smc_set_keepalive(struct sock *sk, int val) smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } -static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct smc_sock *smc; - - smc = smc_clcsock_user_data(sk); - - if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > - sk->sk_max_ack_backlog) - goto drop; - - if (sk_acceptq_is_full(&smc->sk)) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); - goto drop; - } - - /* passthrough to origin syn recv sock fct */ - return smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, - own_req); - -drop: - dst_release(dst); - tcp_listendrop(sk); - return NULL; -} - static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; @@ -1632,9 +1602,6 @@ static 
void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) - atomic_dec(&lsmc->queued_smc_hs); - if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -2243,9 +2210,6 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) - atomic_inc(&lsmc->queued_smc_hs); - new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -2312,15 +2276,6 @@ static int smc_listen(struct socket *sock, int backlog) smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); - - /* save origin ops */ - smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; - - smc->af_ops = *smc->ori_af_ops; - smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; - - inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; - rc = kernel_listen(smc->clcsock, backlog); if (rc) { smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; diff --git a/net/smc/smc.h b/net/smc/smc.h index 440231b8adc3..27f85b2446b3 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -256,10 +256,6 @@ struct smc_sock { /* smc sock container */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ - atomic_t queued_smc_hs; /* queued smc handshakes */ - struct inet_connection_sock_af_ops af_ops; - const struct inet_connection_sock_af_ops *ori_af_ops; - /* original af ops */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value @@ -284,7 +280,7 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } -static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) +static inline struct smc_sock 
*smc_clcsock_user_data(struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); -- Gitee From 38526840d8551cb7a797be65ad06e2ee1387a829 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:58:44 +0800 Subject: [PATCH 017/148] Revert "net/smc: Make smc_tcp_listen_work() independent" This reverts commit 4a38221e533eb01e3d50dab8a2f955f7f25a9a63. --- net/smc/af_smc.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b324a5ee2089..a4974d14b54d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -62,7 +62,6 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ -static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ @@ -2238,7 +2237,7 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) + if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } } @@ -3022,14 +3021,9 @@ static int __init smc_init(void) goto out_nl; rc = -ENOMEM; - - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); - if (!smc_tcp_ls_wq) - goto out_pnet; - smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) - goto out_alloc_tcp_ls_wq; + goto out_pnet; smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); if (!smc_close_wq) @@ -3133,8 +3127,6 @@ static int __init smc_init(void) destroy_workqueue(smc_close_wq); out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); -out_alloc_tcp_ls_wq: - destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); out_nl: @@ -3155,7 +3147,6 @@ static void __exit smc_exit(void) smc_core_exit(); 
smc_ib_unregister_client(); destroy_workqueue(smc_close_wq); - destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); -- Gitee From af761657474d63d94eec77789ee0ba32838feabc Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:58:53 +0800 Subject: [PATCH 018/148] Revert "net/smc: Forward wakeup to smc socket waitqueue after fallback" This reverts commit 065384cc753428af67421b2597bf3f3d57b08d82. --- net/smc/af_smc.c | 133 ++++++----------------------------------------- net/smc/smc.h | 20 +------ 2 files changed, 16 insertions(+), 137 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a4974d14b54d..9fade0edf56d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -573,115 +573,17 @@ static void smc_stat_fallback(struct smc_sock *smc) mutex_unlock(&net->smc.mutex_fback_rsn); } -/* must be called under rcu read lock */ -static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) -{ - struct socket_wq *wq; - __poll_t flags; - - wq = rcu_dereference(smc->sk.sk_wq); - if (!skwq_has_sleeper(wq)) - return; - - /* wake up smc sk->sk_wq */ - if (!key) { - /* sk_state_change */ - wake_up_interruptible_all(&wq->wait); - } else { - flags = key_to_poll(key); - if (flags & (EPOLLIN | EPOLLOUT)) - /* sk_data_ready or sk_write_space */ - wake_up_interruptible_sync_poll(&wq->wait, flags); - else if (flags & EPOLLERR) - /* sk_error_report */ - wake_up_interruptible_poll(&wq->wait, flags); - } -} - -static int smc_fback_mark_woken(wait_queue_entry_t *wait, - unsigned int mode, int sync, void *key) -{ - struct smc_mark_woken *mark = - container_of(wait, struct smc_mark_woken, wait_entry); - - mark->woken = true; - mark->key = key; - return 0; -} - -static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, - void (*clcsock_callback)(struct sock *sk)) -{ - struct smc_mark_woken mark = { .woken = false }; - struct socket_wq *wq; - - 
init_waitqueue_func_entry(&mark.wait_entry, - smc_fback_mark_woken); - rcu_read_lock(); - wq = rcu_dereference(clcsk->sk_wq); - if (!wq) - goto out; - add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); - clcsock_callback(clcsk); - remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); - - if (mark.woken) - smc_fback_wakeup_waitqueue(smc, mark.key); -out: - rcu_read_unlock(); -} - -static void smc_fback_state_change(struct sock *clcsk) -{ - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); - - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); -} - -static void smc_fback_data_ready(struct sock *clcsk) -{ - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); - - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); -} - -static void smc_fback_write_space(struct sock *clcsk) -{ - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); - - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); -} - -static void smc_fback_error_report(struct sock *clcsk) -{ - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); - - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); -} - static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - struct sock *clcsk; + wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); + wait_queue_head_t *clc_wait; + unsigned long flags; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } - clcsk = smc->clcsock->sk; - smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -692,22 +594,16 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - /* There might be some wait entries remaining - * in smc sk->sk_wq and they should be woken up - * as clcsock's wait queue is woken up. 
+ /* There may be some entries remaining in + * smc socket->wq, which should be removed + * to clcsocket->wq during the fallback. */ - smc->clcsk_state_change = clcsk->sk_state_change; - smc->clcsk_data_ready = clcsk->sk_data_ready; - smc->clcsk_write_space = clcsk->sk_write_space; - smc->clcsk_error_report = clcsk->sk_error_report; - - clcsk->sk_state_change = smc_fback_state_change; - clcsk->sk_data_ready = smc_fback_data_ready; - clcsk->sk_write_space = smc_fback_write_space; - clcsk->sk_error_report = smc_fback_error_report; - - smc->clcsock->sk->sk_user_data = - (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + clc_wait = sk_sleep(smc->clcsock->sk); + spin_lock_irqsave(&smc_wait->lock, flags); + spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); + list_splice_init(&smc_wait->head, &clc_wait->head); + spin_unlock(&clc_wait->lock); + spin_unlock_irqrestore(&smc_wait->lock, flags); } mutex_unlock(&smc->clcsock_release_lock); return 0; @@ -2229,9 +2125,10 @@ static void smc_tcp_listen_work(struct work_struct *work) static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc = - smc_clcsock_user_data(listen_clcsock); + struct smc_sock *lsmc; + lsmc = (struct smc_sock *) + ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); if (!lsmc) return; lsmc->clcsk_data_ready(listen_clcsock); diff --git a/net/smc/smc.h b/net/smc/smc.h index 27f85b2446b3..a7177ec11f87 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -135,12 +135,6 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; -struct smc_mark_woken { - bool woken; - void *key; - wait_queue_entry_t wait_entry; -}; - struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -238,14 +232,8 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ - void (*clcsk_state_change)(struct sock *sk); - /* original stat_change fct. 
*/ void (*clcsk_data_ready)(struct sock *sk); - /* original data_ready fct. */ - void (*clcsk_write_space)(struct sock *sk); - /* original write_space fct. */ - void (*clcsk_error_report)(struct sock *sk); - /* original error_report fct. */ + /* original data_ready fct. **/ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ @@ -280,12 +268,6 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } -static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) -{ - return (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); -} - extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ -- Gitee From 9f2e44447088eab5b41fe7828768b592e043bb3d Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:59:04 +0800 Subject: [PATCH 019/148] Revert "net/smc: Transitional solution for clcsock race issue" This reverts commit 802ad3c387b41c3c8a535c492a202c43a807a847. 
--- net/smc/af_smc.c | 63 +++++++++--------------------------------------- 1 file changed, 12 insertions(+), 51 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9fade0edf56d..f04a815a4a29 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -573,17 +573,12 @@ static void smc_stat_fallback(struct smc_sock *smc) mutex_unlock(&net->smc.mutex_fback_rsn); } -static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) +static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait; + wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); unsigned long flags; - mutex_lock(&smc->clcsock_release_lock); - if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); - return -EBADF; - } smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -598,30 +593,18 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * smc socket->wq, which should be removed * to clcsocket->wq during the fallback. 
*/ - clc_wait = sk_sleep(smc->clcsock->sk); spin_lock_irqsave(&smc_wait->lock, flags); spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); list_splice_init(&smc_wait->head, &clc_wait->head); spin_unlock(&clc_wait->lock); spin_unlock_irqrestore(&smc_wait->lock, flags); } - mutex_unlock(&smc->clcsock_release_lock); - return 0; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - struct net *net = sock_net(&smc->sk); - int rc = 0; - - rc = smc_switch_to_fallback(smc, reason_code); - if (rc) { /* fallback fails */ - this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); - if (smc->sk.sk_state == SMC_INIT) - sock_put(&smc->sk); /* passive closing */ - return rc; - } + smc_switch_to_fallback(smc, reason_code); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) @@ -1542,12 +1525,11 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, { /* RDMA setup failed, switch back to TCP */ smc_conn_abort(new_smc, local_first); - if (reason_code < 0 || - smc_switch_to_fallback(new_smc, reason_code)) { - /* error, no fallback possible */ + if (reason_code < 0) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } + smc_switch_to_fallback(new_smc, reason_code); if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); @@ -1992,11 +1974,8 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); - if (rc) - smc_listen_out_err(new_smc); - else - smc_listen_out_connected(new_smc); + smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + smc_listen_out_connected(new_smc); return; } @@ -2285,9 +2264,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & 
MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); - if (rc) - goto out; + smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; goto out; @@ -2480,11 +2457,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ - mutex_lock(&smc->clcsock_release_lock); - if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); - return -EBADF; - } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else @@ -2494,7 +2466,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } - mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -2511,7 +2482,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } @@ -2532,23 +2503,13 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; - int rc; smc = smc_sk(sock->sk); - mutex_lock(&smc->clcsock_release_lock); - if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); - return -EBADF; - } /* socket options apply to the CLC socket */ - if (unlikely(!smc->clcsock->ops->getsockopt)) { - mutex_unlock(&smc->clcsock_release_lock); + if (unlikely(!smc->clcsock->ops->getsockopt)) return -EOPNOTSUPP; - } - rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, - optval, optlen); - mutex_unlock(&smc->clcsock_release_lock); - return rc; + return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + optval, optlen); } static int 
smc_ioctl(struct socket *sock, unsigned int cmd, -- Gitee From 4aa72db15fb5cb5da8b7acdc599c22c8baab8edd Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:59:14 +0800 Subject: [PATCH 020/148] Revert "net/smc: Avoid warning of possible recursive locking" This reverts commit d88f34a2246206d521c137a6b9b55597d92f8261. --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f04a815a4a29..03c66294aac0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -594,7 +594,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * to clcsocket->wq during the fallback. */ spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); + spin_lock(&clc_wait->lock); list_splice_init(&smc_wait->head, &clc_wait->head); spin_unlock(&clc_wait->lock); spin_unlock_irqrestore(&smc_wait->lock, flags); -- Gitee From 28a4920183741c7f1549cdd24e8dc200b6fe3830 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:59:22 +0800 Subject: [PATCH 021/148] Revert "net/smc: Transfer remaining wait queue entries during fallback" This reverts commit 3e5c3141cd8c51a8bcc3b66f77a991d42b7b83d6. 
--- net/smc/af_smc.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 03c66294aac0..c4a63deb4fe5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -575,10 +575,6 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); - unsigned long flags; - smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -588,16 +584,6 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - - /* There may be some entries remaining in - * smc socket->wq, which should be removed - * to clcsocket->wq during the fallback. - */ - spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock(&clc_wait->lock); - list_splice_init(&smc_wait->head, &clc_wait->head); - spin_unlock(&clc_wait->lock); - spin_unlock_irqrestore(&smc_wait->lock, flags); } } -- Gitee From b1805fe1137d2d024bf335f4305858ad49504800 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:59:31 +0800 Subject: [PATCH 022/148] Revert "net/smc: Fix hung_task when removing SMC-R devices" This reverts commit b40dea31669aefc07c5c21d32cab6b824457e5ae. --- net/smc/smc_core.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index bdc16e8cdee5..0728804406fa 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1550,11 +1550,16 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd) /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. 
+ * + * We must wait here for QPs been destroyed before we destroy the CQs, + * or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus + * smc_sock cannot be released. */ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + LIST_HEAD(lgr_linkdown_list); int i; spin_lock_bh(&smc_lgr_list.lock); @@ -1566,7 +1571,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) - smcr_link_down_cond_sched(&lgr->lnk[i]); + list_move_tail(&lgr->list, &lgr_linkdown_list); } } } @@ -1578,6 +1583,16 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) __smc_lgr_terminate(lgr, false); } + list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) { + mutex_lock(&lgr->llc_conf_mutex); + smcr_link_down_cond(&lgr->lnk[i]); + mutex_unlock(&lgr->llc_conf_mutex); + } + } + } + if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, -- Gitee From f9ee896cbe700f4c5fb8f2d01508e1f67687dc28 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:59:38 +0800 Subject: [PATCH 023/148] Revert "net/smc: Remove unused function declaration" This reverts commit 147ce0cd29d03bc96096aff756c39e8ee59e1268. 
--- net/smc/smc_wr.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index a54e90a1110f..47512ccce5ef 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -125,6 +125,10 @@ int smc_wr_tx_v2_send(struct smc_link *link, int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, + smc_wr_tx_dismisser dismisser, + unsigned long data); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); -- Gitee From 854afe5567ebcb0902baffbaa45a72acd9379c96 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:59:46 +0800 Subject: [PATCH 024/148] Revert "net/smc: Resolve the race between SMC-R link access and clear" This reverts commit 52fca67ebdf9aede886dca86820cd09d5bc31593. 
--- net/smc/smc_core.c | 52 +++++++++++----------------------------------- net/smc/smc_core.h | 4 ---- 2 files changed, 12 insertions(+), 44 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0728804406fa..5d20f521ef76 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -762,8 +762,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); - refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ - lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; @@ -1016,12 +1014,8 @@ void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); - /* link_hold in smc_conn_create() */ - smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); - /* link_put in smc_conn_free() */ - smcr_link_hold(conn->lnk); } struct smc_link *smc_switch_conns(struct smc_link_group *lgr, @@ -1184,8 +1178,6 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); lgr_put: - if (!lgr->is_smcd) - smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } @@ -1242,29 +1234,14 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } } -static void __smcr_link_clear(struct smc_link *lnk) +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_clear(struct smc_link *lnk, bool log) { struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; - smc_wr_free_link_mem(lnk); - smc_ibdev_cnt_dec(lnk); - put_device(&lnk->smcibdev->ibdev->dev); - smcibdev = lnk->smcibdev; - memset(lnk, 0, sizeof(struct smc_link)); - lnk->state = SMC_LNK_UNUSED; - if (!atomic_dec_return(&smcibdev->lnk_cnt)) - wake_up(&smcibdev->lnks_deleted); - smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ -} - 
-/* must be called under lgr->llc_conf_mutex lock */ -void smcr_link_clear(struct smc_link *lnk, bool log) -{ - if (!lnk->lgr || lnk->clearing || - lnk->state == SMC_LNK_UNUSED) + if (!lgr || lnk->state == SMC_LNK_UNUSED) return; - lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); @@ -1273,18 +1250,15 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); - smcr_link_put(lnk); /* theoretically last link_put */ -} - -void smcr_link_hold(struct smc_link *lnk) -{ - refcount_inc(&lnk->refcnt); -} - -void smcr_link_put(struct smc_link *lnk) -{ - if (refcount_dec_and_test(&lnk->refcnt)) - __smcr_link_clear(lnk); + smc_wr_free_link_mem(lnk); + smc_ibdev_cnt_dec(lnk); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -1952,8 +1926,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } } smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ - if (!conn->lgr->is_smcd) - smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 35a85ec08919..8b4ed82785b9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -138,8 +138,6 @@ struct smc_link { u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? 
*/ - u8 clearing : 1; /* link is being cleared */ - refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ @@ -529,8 +527,6 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); -void smcr_link_hold(struct smc_link *lnk); -void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); -- Gitee From 2852c6a7b525f919bd6e2c0c98fb62c182fdf715 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 16:59:54 +0800 Subject: [PATCH 025/148] Revert "net/smc: Introduce a new conn->lgr validity check helper" This reverts commit d798b8c9ffcf5fb13a987ba62d4fdca93d353ef9. --- net/smc/af_smc.c | 6 +----- net/smc/smc_cdc.c | 3 +-- net/smc/smc_clc.c | 2 +- net/smc/smc_core.c | 14 ++++++-------- net/smc/smc_core.h | 5 ----- net/smc/smc_diag.c | 6 +++--- 6 files changed, 12 insertions(+), 24 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c4a63deb4fe5..3832b8c1a8a1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -627,13 +627,9 @@ static void smc_conn_abort(struct smc_sock *smc, int local_first) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; - bool lgr_valid = false; - - if (smc_conn_lgr_valid(conn)) - lgr_valid = true; smc_conn_free(conn); - if (local_first && lgr_valid) + if (local_first) smc_lgr_cleanup_early(lgr); } diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index fe72e416d926..2b453894ed38 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -202,8 +202,7 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; - if (!smc_conn_lgr_valid(conn) || - (conn->lgr->is_smcd && 
conn->lgr->peer_shutdown)) + if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) return -EPIPE; if (conn->lgr->is_smcd) { diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ce27399b38b1..6be95a2a7b25 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -774,7 +774,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? SMC_FIRST_CONTACT_MASK : 0; - if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) && + if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) && smc_ib_is_valid_local_systemid()) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5d20f521ef76..cc1611937b9b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -211,7 +211,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if (!smc_conn_lgr_valid(conn)) + if (!lgr) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { @@ -1155,7 +1155,7 @@ void smc_conn_free(struct smc_connection *conn) return; conn->freed = 1; - if (!smc_conn_lgr_valid(conn)) + if (!conn->alert_token_local) /* Connection has already unregistered from * link group. 
*/ @@ -2312,16 +2312,14 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || - !smc_link_active(conn->lnk)) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || - !smc_link_active(conn->lnk)) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -2330,7 +2328,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; - if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) @@ -2344,7 +2342,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { int i; - if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8b4ed82785b9..fb5661d7a771 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -411,11 +411,6 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } -static inline bool smc_conn_lgr_valid(struct smc_connection *conn) -{ - return conn->lgr && conn->alert_token_local; -} - /* * Returns true if the specified link is usable. 
* diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index bbe00b50b666..1fa7c7cf9332 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -89,7 +89,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, r->diag_state = sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; - else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) + else if (smc->conn.lgr && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; @@ -148,7 +148,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && + if (smc->conn.lgr && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { @@ -168,7 +168,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } - if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && + if (smc->conn.lgr && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; -- Gitee From aa0faf992a1534bd0b6f059ba4f6fd7617a83398 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:00:02 +0800 Subject: [PATCH 026/148] Revert "net/smc: Resolve the race between link group access and termination" This reverts commit 27442fe0b3e7db3b4a7527b5ec553f30d60d7ca9. 
--- net/smc/smc.h | 1 - net/smc/smc_core.c | 60 +++++++++------------------------------------- net/smc/smc_core.h | 3 --- 3 files changed, 11 insertions(+), 53 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index a7177ec11f87..dca5f1c5ef00 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -225,7 +225,6 @@ struct smc_connection { u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ - u8 freed : 1; /* normal termiation */ u8 out_of_sync : 1; /* out of sync with peer */ }; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index cc1611937b9b..034604fe0db0 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -218,6 +218,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); + conn->lgr = NULL; } int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) @@ -765,7 +766,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; - smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); @@ -820,7 +820,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); - smc_lgr_put(lgr); /* lgr_hold above */ return rc; } @@ -859,7 +858,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->terminating = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ mutex_init(&lgr->sndbufs_lock); mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -1148,19 +1146,8 @@ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; 
- if (!lgr || conn->freed) - /* Connection has never been registered in a - * link group, or has already been freed. - */ + if (!lgr) return; - - conn->freed = 1; - if (!conn->alert_token_local) - /* Connection has already unregistered from - * link group. - */ - goto lgr_put; - if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); @@ -1177,8 +1164,6 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); -lgr_put: - smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } /* unregister a link from a buf_desc */ @@ -1237,10 +1222,9 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { - struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; - if (!lgr || lnk->state == SMC_LNK_UNUSED) + if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); @@ -1258,7 +1242,6 @@ void smcr_link_clear(struct smc_link *lnk, bool log) lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); - smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -1326,21 +1309,6 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) __smc_lgr_free_bufs(lgr, true); } -/* won't be freed until no one accesses to lgr anymore */ -static void __smc_lgr_free(struct smc_link_group *lgr) -{ - smc_lgr_free_bufs(lgr); - if (lgr->is_smcd) { - if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) - wake_up(&lgr->smcd->lgrs_deleted); - } else { - smc_wr_free_lgr_mem(lgr); - if (!atomic_dec_return(&lgr_cnt)) - wake_up(&lgrs_deleted); - } - kfree(lgr); -} - /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { @@ -1356,23 +1324,19 @@ static void smc_lgr_free(struct smc_link_group *lgr) smc_llc_lgr_clear(lgr); } + 
smc_lgr_free_bufs(lgr); destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); + } else { + smc_wr_free_lgr_mem(lgr); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); } - smc_lgr_put(lgr); /* theoretically last lgr_put */ -} - -void smc_lgr_hold(struct smc_link_group *lgr) -{ - refcount_inc(&lgr->refcnt); -} - -void smc_lgr_put(struct smc_link_group *lgr) -{ - if (refcount_dec_and_test(&lgr->refcnt)) - __smc_lgr_free(lgr); + kfree(lgr); } static void smc_sk_wake_ups(struct smc_sock *smc) @@ -1925,8 +1889,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) goto out; } } - smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ - conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fb5661d7a771..e248a2d3672b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -250,7 +250,6 @@ struct smc_link_group { u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ - refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; @@ -489,8 +488,6 @@ struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); -void smc_lgr_hold(struct smc_link_group *lgr); -void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, -- Gitee From 3eadac4ebd7c0c1f9bcf30200c59369b3de4250c Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:00:10 +0800 Subject: [PATCH 027/148] Revert "net/smc: 
Reset conn->lgr when link group registration fails" This reverts commit dfaab93010dba0038c6926dcbc62ce073b12ff12. --- net/smc/af_smc.c | 8 +++----- net/smc/smc_core.c | 12 +++++------- net/smc/smc_core.h | 2 +- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3832b8c1a8a1..f0abb77f858a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -625,12 +625,10 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, static void smc_conn_abort(struct smc_sock *smc, int local_first) { - struct smc_connection *conn = &smc->conn; - struct smc_link_group *lgr = conn->lgr; - - smc_conn_free(conn); if (local_first) - smc_lgr_cleanup_early(lgr); + smc_lgr_cleanup_early(&smc->conn); + else + smc_conn_free(&smc->conn); } /* check if there is a rdma device available for this connection. */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 034604fe0db0..5eb4aea80355 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -171,10 +171,8 @@ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) if (!conn->lgr->is_smcd) { rc = smcr_lgr_conn_assign_link(conn, first); - if (rc) { - conn->lgr = NULL; + if (rc) return rc; - } } /* find a new alert_token_local value not yet used by some connection * in this link group @@ -624,13 +622,15 @@ int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void smc_lgr_cleanup_early(struct smc_link_group *lgr) +void smc_lgr_cleanup_early(struct smc_connection *conn) { + struct smc_link_group *lgr = conn->lgr; spinlock_t *lgr_lock; if (!lgr) return; + smc_conn_free(conn); smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ @@ -1884,10 +1884,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) write_lock_bh(&lgr->conns_lock); rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); - if (rc) { - 
smc_lgr_cleanup_early(lgr); + if (rc) goto out; - } } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e248a2d3672b..ebca96fe3a2b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -486,7 +486,7 @@ static inline void smc_set_pci_values(struct pci_dev *pci_dev, struct smc_sock; struct smc_clc_msg_accept_confirm; -void smc_lgr_cleanup_early(struct smc_link_group *lgr); +void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); -- Gitee From 56e4b832a8f8a24ecc7b3f75680dcffb72e05032 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:04:34 +0800 Subject: [PATCH 028/148] Revert "Revert "anolis: net/smc: Reset conn->lgr when link group registration failed"" This reverts commit 9e4d801cd9bb03c190dda06ab672ee96979a10d9. 
--- net/smc/smc_core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5eb4aea80355..7ca337b6ae38 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -171,8 +171,10 @@ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) if (!conn->lgr->is_smcd) { rc = smcr_lgr_conn_assign_link(conn, first); - if (rc) + if (rc) { + conn->lgr = NULL; return rc; + } } /* find a new alert_token_local value not yet used by some connection * in this link group @@ -1884,8 +1886,14 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) write_lock_bh(&lgr->conns_lock); rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); - if (rc) + if (rc) { + spin_lock_bh(lgr_lock); + if (!list_empty(&lgr->list)) + list_del_init(&lgr->list); + spin_unlock_bh(lgr_lock); + __smc_lgr_terminate(lgr, true); goto out; + } } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; -- Gitee From 3228b4585ab53f7f0d402115b4321ebf519e9e7f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:04:41 +0800 Subject: [PATCH 029/148] Revert "Revert "anolis: net/smc: Resolve the race between link group access and termination"" This reverts commit f2820bdbc405bdf26f50d825a88cce1e33527543. 
--- net/smc/smc.h | 1 + net/smc/smc_core.c | 42 ++++++++++++++++++++++++++++++++++++++---- net/smc/smc_core.h | 3 +++ 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index dca5f1c5ef00..a7177ec11f87 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -225,6 +225,7 @@ struct smc_connection { u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ + u8 freed : 1; /* normal termiation */ u8 out_of_sync : 1; /* out of sync with peer */ }; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7ca337b6ae38..3f724b826527 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -186,6 +186,7 @@ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); + smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ conn->lgr->conns_num++; return 0; } @@ -218,7 +219,6 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - conn->lgr = NULL; } int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) @@ -768,6 +768,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; + smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); @@ -860,6 +861,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->terminating = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; + refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ mutex_init(&lgr->sndbufs_lock); mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -1148,8 +1150,20 @@ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - 
if (!lgr) + if (!lgr || conn->freed) + /* smc connection wasn't registered to a link group + * or has already been freed before. + * + * Judge these to ensure that lgr refcnt will be put + * only once if connection has been registered to a + * link group successfully. + */ return; + + conn->freed = 1; + if (conn->killed) + goto lgr_put; + if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); @@ -1166,6 +1180,8 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); +lgr_put: + smc_lgr_put(lgr); /* lgr_hold in smc_lgr_register_conn() */ } /* unregister a link from a buf_desc */ @@ -1237,6 +1253,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + smc_lgr_put(lnk->lgr); /* lgr_hold in smcr_link_init() */ smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; @@ -1311,6 +1328,13 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) __smc_lgr_free_bufs(lgr, true); } +/* won't be freed until no one accesses to lgr anymore */ +static void __smc_lgr_free(struct smc_link_group *lgr) +{ + smc_lgr_free_bufs(lgr); + kfree(lgr); +} + /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { @@ -1326,7 +1350,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) smc_llc_lgr_clear(lgr); } - smc_lgr_free_bufs(lgr); destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); @@ -1338,7 +1361,18 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } - kfree(lgr); + smc_lgr_put(lgr); /* theoretically last lgr_put */ +} + +void smc_lgr_hold(struct smc_link_group *lgr) +{ + refcount_inc(&lgr->refcnt); +} + +void smc_lgr_put(struct smc_link_group *lgr) +{ + if (refcount_dec_and_test(&lgr->refcnt)) + __smc_lgr_free(lgr); } static void 
smc_sk_wake_ups(struct smc_sock *smc) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index ebca96fe3a2b..35c76a6a45c5 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -250,6 +250,7 @@ struct smc_link_group { u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ + refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; @@ -488,6 +489,8 @@ struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); +void smc_lgr_hold(struct smc_link_group *lgr); +void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, -- Gitee From 75070256de1ed1376c84e9cc9d2a14eceaaceb6b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:04:50 +0800 Subject: [PATCH 030/148] Revert "Revert "anolis: net/smc: Resolve the race between SMC-R link access and clear"" This reverts commit 015e52bbe1bd22b263dc1008886cdf9040182ef5. 
--- net/smc/smc_core.c | 43 +++++++++++++++++++++++++++++++++++-------- net/smc/smc_core.h | 4 ++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3f724b826527..8153456d396f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -155,6 +155,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) if (!conn->lnk) return SMC_CLC_DECL_NOACTLINK; atomic_inc(&conn->lnk->conn_cnt); + smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ return 0; } @@ -765,6 +766,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); + refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ + lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; @@ -1016,8 +1019,12 @@ void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); + /* put old link, hold in smcr_lgr_conn_assign_link() */ + smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); + /* hold new link, put in smc_conn_free() */ + smcr_link_hold(conn->lnk); } struct smc_link *smc_switch_conns(struct smc_link_group *lgr, @@ -1154,9 +1161,9 @@ void smc_conn_free(struct smc_connection *conn) /* smc connection wasn't registered to a link group * or has already been freed before. * - * Judge these to ensure that lgr refcnt will be put - * only once if connection has been registered to a - * link group successfully. + * Judge these to ensure that lgr/link refcnt will be + * put only once if connection has been registered to + * a link group successfully. 
*/ return; @@ -1181,6 +1188,8 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); lgr_put: + if (!lgr->is_smcd) + smcr_link_put(conn->lnk); /* link_hold in smcr_lgr_conn_assign_link() */ smc_lgr_put(lgr); /* lgr_hold in smc_lgr_register_conn() */ } @@ -1237,13 +1246,23 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } } +void __smcr_link_clear(struct smc_link *lnk) +{ + smc_wr_free_link_mem(lnk); + smc_lgr_put(lnk->lgr); /* lgr_hold in smcr_link_init() */ + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; +} + /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { struct smc_ib_device *smcibdev; - if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) + if (lnk->clearing || !lnk->lgr || + lnk->state == SMC_LNK_UNUSED) return; + lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); @@ -1252,15 +1271,23 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); - smc_wr_free_link_mem(lnk); - smc_lgr_put(lnk->lgr); /* lgr_hold in smcr_link_init() */ smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; - memset(lnk, 0, sizeof(struct smc_link)); - lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); + smcr_link_put(lnk); /* theoretically last link_put */ +} + +void smcr_link_hold(struct smc_link *lnk) +{ + refcount_inc(&lnk->refcnt); +} + +void smcr_link_put(struct smc_link *lnk) +{ + if (refcount_dec_and_test(&lnk->refcnt)) + __smcr_link_clear(lnk); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 35c76a6a45c5..0c2dc09cfccf 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -138,6 +138,8 @@ struct smc_link { u8 
peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ + u8 clearing : 1; /* link is being cleared */ + refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ @@ -522,6 +524,8 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); +void smcr_link_hold(struct smc_link *lnk); +void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); -- Gitee From 5d65f17c98f819c92055a900a07ea4f1b0abcf7d Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:05:02 +0800 Subject: [PATCH 031/148] Revert "Revert "anolis: net/smc: Avoid setting clcsock options after clcsock released"" This reverts commit c1c311ca2b87198093c44e58c47a5c3f65d78551. 
--- net/smc/af_smc.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f0abb77f858a..e90cbb1a1ad3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2437,6 +2437,11 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else @@ -2446,6 +2451,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } + mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -2483,13 +2489,21 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; + int rc; smc = smc_sk(sock->sk); + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) return -EOPNOTSUPP; - return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); + mutex_unlock(&smc->clcsock_release_lock); + return rc; } static int smc_ioctl(struct socket *sock, unsigned int cmd, -- Gitee From eaa4d6a6503642ce189349e07857e09a75b56332 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:05:21 +0800 Subject: [PATCH 032/148] Revert "Revert "anolis: net/smc: Forward wakeup to smc socket wait queue when fallback"" This reverts commit 1a8fbe8e02db4ddeaa0ffef740b04ade092873a9. 
--- net/smc/af_smc.c | 143 +++++++++++++++++++++++++++++++++++++++++++---- net/smc/smc.h | 12 ++++ 2 files changed, 145 insertions(+), 10 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e90cbb1a1ad3..70d09b779fe1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -67,6 +67,10 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); +static void smc_clcsock_state_change(struct sock *clcsk); +static void smc_clcsock_data_ready(struct sock *clcsk); +static void smc_clcsock_write_space(struct sock *clcsk); +static void smc_clcsock_error_report(struct sock *clcsk); static void smc_set_keepalive(struct sock *sk, int val) { @@ -575,6 +579,8 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { + struct sock *clcsk = smc->clcsock->sk; + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -584,6 +590,19 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + + smc->clcsk_state_change = clcsk->sk_state_change; + smc->clcsk_data_ready = clcsk->sk_data_ready; + smc->clcsk_write_space = clcsk->sk_write_space; + smc->clcsk_error_report = clcsk->sk_error_report; + + clcsk->sk_state_change = smc_clcsock_state_change; + clcsk->sk_data_ready = smc_clcsock_data_ready; + clcsk->sk_write_space = smc_clcsock_write_space; + clcsk->sk_error_report = smc_clcsock_error_report; + + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } } @@ -2082,20 +2101,124 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } -static void smc_clcsock_data_ready(struct sock *listen_clcsock) +static void 
smc_wake_up_waitqueue(struct smc_sock *smc, void *key) { - struct smc_sock *lsmc; + struct socket_wq *wq; + __poll_t flags; + + rcu_read_lock(); + wq = rcu_dereference(smc->sk.sk_wq); + if (skwq_has_sleeper(wq)) { + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } + } + rcu_read_unlock(); +} - lsmc = (struct smc_sock *) - ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!lsmc) +static int smc_mark_clcwq_woken(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct smc_mark_wake_up *mark; + + mark = container_of(wait, struct smc_mark_wake_up, + wait_entry); + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_forward_wake_up(struct smc_sock *smc, struct sock *clcsk, + void (*clcsk_callback)(struct sock *sk)) +{ + struct smc_mark_wake_up mark = { .woken = false }; + struct socket_wq *wq; + + rcu_read_lock(); + /* ensure that clcsk->sk_wq still exists */ + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) { + rcu_read_unlock(); return; - lsmc->clcsk_data_ready(listen_clcsock); - if (lsmc->sk.sk_state == SMC_LISTEN) { - sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) - sock_put(&lsmc->sk); } + + init_waitqueue_func_entry(&mark.wait_entry, + smc_mark_clcwq_woken); + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsk_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + rcu_read_unlock(); + + if (mark.woken) + smc_wake_up_waitqueue(smc, mark.key); +} + +static void smc_clcsock_state_change(struct sock *clcsk) +{ + struct smc_sock *smc; + + smc = (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & 
~SK_USER_DATA_NOCOPY); + if (!smc) + return; + + smc_forward_wake_up(smc, clcsk, smc->clcsk_state_change); +} + +static void smc_clcsock_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc; + + smc = (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!smc) + return; + + if (!smc->use_fallback) { + /* listening situation */ + smc->clcsk_data_ready(clcsk); + if (smc->sk.sk_state == SMC_LISTEN) { + sock_hold(&smc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_hs_wq, &smc->tcp_listen_work)) + sock_put(&smc->sk); + } + } else { + /* fallback situation */ + smc_forward_wake_up(smc, clcsk, smc->clcsk_data_ready); + } +} + +static void smc_clcsock_write_space(struct sock *clcsk) +{ + struct smc_sock *smc; + + smc = (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!smc) + return; + + smc_forward_wake_up(smc, clcsk, smc->clcsk_write_space); +} + +static void smc_clcsock_error_report(struct sock *clcsk) +{ + struct smc_sock *smc; + + smc = (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!smc) + return; + + smc_forward_wake_up(smc, clcsk, smc->clcsk_error_report); } static int smc_listen(struct socket *sock, int backlog) diff --git a/net/smc/smc.h b/net/smc/smc.h index a7177ec11f87..4175ba6b7281 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -135,6 +135,12 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; +struct smc_mark_wake_up { + bool woken; + void *key; + wait_queue_entry_t wait_entry; +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -232,8 +238,14 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); /* original data_ready fct. 
**/ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ -- Gitee From b542923af342ba5bf191c6cf187fffa4d3ea1d73 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:05:28 +0800 Subject: [PATCH 033/148] Revert "Revert "net/smc: Reduce overflow of smc clcsock listen queue"" This reverts commit d2eb57c256eb24ab069b9edd2478b0794aaac5de. --- net/smc/af_smc.c | 13 +++++++++++-- net/smc/smc.h | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 70d09b779fe1..f0554e09adcf 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -62,6 +62,7 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ +struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ @@ -2188,7 +2189,7 @@ static void smc_clcsock_data_ready(struct sock *clcsk) smc->clcsk_data_ready(clcsk); if (smc->sk.sk_state == SMC_LISTEN) { sock_hold(&smc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_hs_wq, &smc->tcp_listen_work)) + if (!queue_work(smc_tcp_ls_wq, &smc->tcp_listen_work)) sock_put(&smc->sk); } } else { @@ -2996,9 +2997,14 @@ static int __init smc_init(void) goto out_nl; rc = -ENOMEM; + + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + if (!smc_tcp_ls_wq) + goto out_pnet; + smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) - goto out_pnet; + goto out_alloc_tcp_ls_wq; smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); if (!smc_close_wq) @@ -3102,6 +3108,8 @@ static int __init smc_init(void) destroy_workqueue(smc_close_wq); out_alloc_hs_wq: 
destroy_workqueue(smc_hs_wq); +out_alloc_tcp_ls_wq: + destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); out_nl: @@ -3122,6 +3130,7 @@ static void __exit smc_exit(void) smc_core_exit(); smc_ib_unregister_client(); destroy_workqueue(smc_close_wq); + destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); diff --git a/net/smc/smc.h b/net/smc/smc.h index 4175ba6b7281..e56a776b5034 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -280,6 +280,7 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +extern struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ -- Gitee From 8b66abd67ff722f1ccd46f8c5f74da9a31de3ca6 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:05:35 +0800 Subject: [PATCH 034/148] Revert "anolis: Revert "anolis: net/smc: Add SMC-R link-down counters"" This reverts commit 32a8ebaa59cb94b24024df58c32bbaf60bf0fa13. 
--- net/smc/smc_core.c | 14 +++++++++++--- net/smc/smc_core.h | 2 ++ net/smc/smc_diag.c | 2 ++ net/smc/smc_llc.c | 3 +++ net/smc/smc_tx.c | 4 +++- net/smc/smc_wr.c | 5 +++++ 6 files changed, 26 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8153456d396f..6e88c2a8ec97 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -773,6 +773,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; + lnk->link_down_cnt_smc = 0; + lnk->link_down_cnt_ib = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); @@ -1083,16 +1085,20 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); - if (rc) + if (rc) { + ++to_lnk->link_down_cnt_smc; goto err_out; + } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); - if (rc) + if (rc) { + ++to_lnk->link_down_cnt_ib; goto err_out; + } goto again; } read_unlock_bh(&lgr->conns_lock); @@ -1758,8 +1764,10 @@ void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk) && - lnk->smcibdev == smcibdev && lnk->ibport == ibport) + lnk->smcibdev == smcibdev && lnk->ibport == ibport) { + ++lnk->link_down_cnt_ib; smcr_link_down_cond_sched(lnk); + } } } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 0c2dc09cfccf..4d4ab95dffb0 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -150,6 +150,8 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* 
connections on this link */ + u64 link_down_cnt_smc; /* smc-caused link down counter */ + u64 link_down_cnt_ib; /* ib-caused link down counter */ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 1fa7c7cf9332..ddecd39aa4a4 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -155,6 +155,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .role = smc->conn.lgr->role, .lnk[0].ibport = smc->conn.lnk->ibport, .lnk[0].link_id = smc->conn.lnk->link_id, + .lnk[0].link_down_cnt_smc = smc->conn.lnk->link_down_cnt_smc, + .lnk[0].link_down_cnt_ib = smc->conn.lnk->link_down_cnt_ib, }; memcpy(linfo.lnk[0].ibname, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 1d8dafa1a35e..1be23778c6e3 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1281,12 +1281,14 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, true, SMC_LLC_DEL_NO_ASYM_NEEDED); if (rc) { + ++lnk_new->link_down_cnt_ib; smcr_link_down_cond(lnk_new); goto out_free; } qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, SMC_LLC_DELETE_LINK); if (!qentry) { + ++lnk_new->link_down_cnt_ib; smcr_link_down_cond(lnk_new); goto out_free; } @@ -2100,6 +2102,7 @@ static void smc_llc_testlink_work(struct work_struct *work) if (!smc_link_active(link)) return; /* link state changed */ if (rc <= 0) { + ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); return; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 68d62ac63dec..8f1d8cd13191 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -334,8 +334,10 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) + if (rc) { + ++link->link_down_cnt_ib; 
smcr_link_down_cond_sched(link); + } return rc; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ca179e2c86b7..de2a383322cb 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -125,6 +125,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) memset(link->lgr->wr_tx_buf_v2, 0, sizeof(*link->lgr->wr_tx_buf_v2)); } + ++link->link_down_cnt_ib; /* terminate link */ smcr_link_down_cond_sched(link); } @@ -219,6 +220,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { + ++link->link_down_cnt_smc; /* timeout - terminate link */ smcr_link_down_cond_sched(link); return -EPIPE; @@ -326,6 +328,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); + ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); } return rc; @@ -382,6 +385,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (atomic_dec_and_test(&link->wr_reg_refcnt)) wake_up_all(&link->wr_reg_wait); if (!rc) { + ++link->link_down_cnt_ib; /* timeout - terminate link */ smcr_link_down_cond_sched(link); return -EPIPE; @@ -462,6 +466,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: + ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); break; default: -- Gitee From 4350da780571310356a91de12b9f08fd54ead52a Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:05:43 +0800 Subject: [PATCH 035/148] Revert "anolis: net/smc: support auto-cork with nagle algorithm" This reverts commit 88d63646ab348d9dc38b2c75d8c5f63ee546d257. 
--- net/smc/af_smc.c | 24 +++++++++++-- net/smc/smc.h | 2 -- net/smc/smc_cdc.c | 11 ++---- net/smc/smc_sysctl.c | 9 ----- net/smc/smc_tx.c | 86 +++++++------------------------------------- 5 files changed, 38 insertions(+), 94 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f0554e09adcf..1ce39e03ac11 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2597,6 +2597,28 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, rc = -EINVAL; } break; + case TCP_NODELAY: + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { + if (val) { + SMC_STAT_INC(smc, ndly_cnt); + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); + } + } + break; + case TCP_CORK: + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { + if (!val) { + SMC_STAT_INC(smc, cork_cnt); + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); + } + } + break; case TCP_DEFER_ACCEPT: smc->sockopt_defer_accept = val; break; @@ -2936,7 +2958,6 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 0; - net->smc.sysctl_autocorking = 1; } return smc_pnet_net_init(net); @@ -3083,7 +3104,6 @@ static int __init smc_init(void) init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; init_net.smc.sysctl_allow_different_subnet = 0; - init_net.smc.sysctl_autocorking = 1; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc.h b/net/smc/smc.h index e56a776b5034..ce3007104042 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -187,8 +187,6 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ - atomic_t tx_pushing; /* nr_threads trying tx push */ - struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git 
a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 2b453894ed38..84c8a4374fdd 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -48,14 +48,9 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { - /* If this is the last pending WR complete, push them to prevent - * no one trying to push when corked. - */ - smc_tx_sndbuf_nonempty(conn); - if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) - wake_up(&conn->cdc_pend_tx_wq); - } + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && + unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); smc_tx_sndbuf_nonfull(smc); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 7f4e0912dd97..db64caca933a 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -44,15 +44,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { - .procname = "autocorking", - .data = &init_net.smc.sysctl_autocorking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 8f1d8cd13191..efe0b393a5fe 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -31,6 +31,7 @@ #include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 +#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -124,37 +125,11 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) return rc; } -/* Strategy: Nagle algorithm - * 1. The first message should never cork - * 2. If we have any inflight messages, wait for the first - * message back - * 3. 
The total corked message should not exceed min(64k, sendbuf/2) - */ -static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +static bool smc_tx_is_corked(struct smc_sock *smc) { - struct smc_connection *conn = &smc->conn; - int prepared_send; - - /* First request && no more message should always pass */ - if (atomic_read(&conn->cdc_pend_tx_wr) == 0 && - !(msg->msg_flags & MSG_MORE)) - return false; - - /* If We have enough data in the send queue that have not been - * pushed, send immediately. - * Note, here we only care about the prepared_sends, but not - * sendbuf_space because sendbuf_space has nothing to do with - * corked data size. - */ - prepared_send = smc_tx_prepared_sends(conn); - if (prepared_send > min(64 * 1024, conn->sndbuf_desc->len >> 1)) - return false; - - if (!sock_net(&smc->sk)->smc.sysctl_autocorking) - return false; - - /* All the other conditions should cork */ - return true; + struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); + + return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } /* sndbuf producer: main API called by socket layer. 
@@ -203,13 +178,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; - /* If our send queue is full but peer have RMBE space, - * we should send them out before wait - */ - if (!atomic_read(&conn->sndbuf_space) && - atomic_read(&conn->peer_rmbe_space) > 0) - smc_tx_sndbuf_nonempty(conn); - if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { rc = smc_tx_wait(smc, msg->msg_flags); if (rc) { @@ -269,17 +237,19 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if (smc_tx_should_cork(smc, msg)) { + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && + (atomic_read(&conn->sndbuf_space) > + (conn->sndbuf_desc->len >> 1))) { /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ conn->tx_corked_bytes += copylen; ++conn->tx_corked_cnt; + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, + SMC_TX_CORK_DELAY); } else { conn->tx_bytes += copylen; ++conn->tx_cnt; - if (delayed_work_pending(&conn->tx_work)) - cancel_delayed_work(&conn->tx_work); smc_tx_sndbuf_nonempty(conn); } @@ -616,31 +586,11 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc = 0; - struct smc_sock *smc = container_of(conn, struct smc_sock, conn); - - /* Only let one to push to prevent wasting of CPU and CDC slot */ - if (atomic_inc_return(&conn->tx_pushing) > 1) - return 0; - -again: - atomic_set(&conn->tx_pushing, 1); - - /* No data in the send queue */ - if (unlikely(smc_tx_prepared_sends(conn) <= 0)) - goto out; - - /* Peer don't have RMBE space */ - if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { - SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); - goto out; - } + int rc; if (conn->killed || - 
conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { - rc = -EPIPE; /* connection being aborted */ - goto out; - } + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -652,16 +602,6 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) conn); smc_close_wake_tx_prepared(smc); } - -out: - /* We need to check whether someone else have added some data into - * the send queue and tried to push but failed when we are pushing. - * If so, we need to try push again to prevent those data in the - * send queue may never been pushed out - */ - if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) - goto again; - return rc; } -- Gitee From 99d93fad60ba04c5dd92b0e1b42e29e6c3c29fec Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:05:50 +0800 Subject: [PATCH 036/148] Revert "net/smc: Reduce overflow of smc clcsock listen queue" This reverts commit d6f7dc474ea41cdf42dfa026d395a677e1902ccd. 
--- net/smc/af_smc.c | 13 ++----------- net/smc/smc.h | 1 - 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1ce39e03ac11..b75eafc2d1c0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -62,7 +62,6 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ -struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ @@ -2189,7 +2188,7 @@ static void smc_clcsock_data_ready(struct sock *clcsk) smc->clcsk_data_ready(clcsk); if (smc->sk.sk_state == SMC_LISTEN) { sock_hold(&smc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_tcp_ls_wq, &smc->tcp_listen_work)) + if (!queue_work(smc_hs_wq, &smc->tcp_listen_work)) sock_put(&smc->sk); } } else { @@ -3018,14 +3017,9 @@ static int __init smc_init(void) goto out_nl; rc = -ENOMEM; - - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); - if (!smc_tcp_ls_wq) - goto out_pnet; - smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) - goto out_alloc_tcp_ls_wq; + goto out_pnet; smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); if (!smc_close_wq) @@ -3128,8 +3122,6 @@ static int __init smc_init(void) destroy_workqueue(smc_close_wq); out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); -out_alloc_tcp_ls_wq: - destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); out_nl: @@ -3150,7 +3142,6 @@ static void __exit smc_exit(void) smc_core_exit(); smc_ib_unregister_client(); destroy_workqueue(smc_close_wq); - destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); diff --git a/net/smc/smc.h b/net/smc/smc.h index ce3007104042..13cd350be773 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -278,7 +278,6 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } -extern 
struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ -- Gitee From 4ecf1629e92d7fb9a5ee805b57509336f5f5ed05 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:05:57 +0800 Subject: [PATCH 037/148] Revert "anolis: net/smc: Forward wakeup to smc socket wait queue when fallback" This reverts commit 0262e7a410fd5ed2c0991fbe94058c5ddb705055. --- net/smc/af_smc.c | 143 ++++------------------------------------------- net/smc/smc.h | 12 ---- 2 files changed, 10 insertions(+), 145 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b75eafc2d1c0..86da7113774c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -67,10 +67,6 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); -static void smc_clcsock_state_change(struct sock *clcsk); -static void smc_clcsock_data_ready(struct sock *clcsk); -static void smc_clcsock_write_space(struct sock *clcsk); -static void smc_clcsock_error_report(struct sock *clcsk); static void smc_set_keepalive(struct sock *sk, int val) { @@ -579,8 +575,6 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - struct sock *clcsk = smc->clcsock->sk; - smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -590,19 +584,6 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - - smc->clcsk_state_change = clcsk->sk_state_change; - smc->clcsk_data_ready = clcsk->sk_data_ready; - smc->clcsk_write_space = clcsk->sk_write_space; - smc->clcsk_error_report = clcsk->sk_error_report; - - clcsk->sk_state_change = 
smc_clcsock_state_change; - clcsk->sk_data_ready = smc_clcsock_data_ready; - clcsk->sk_write_space = smc_clcsock_write_space; - clcsk->sk_error_report = smc_clcsock_error_report; - - smc->clcsock->sk->sk_user_data = - (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } } @@ -2101,126 +2082,22 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } -static void smc_wake_up_waitqueue(struct smc_sock *smc, void *key) -{ - struct socket_wq *wq; - __poll_t flags; - - rcu_read_lock(); - wq = rcu_dereference(smc->sk.sk_wq); - if (skwq_has_sleeper(wq)) { - if (!key) { - /* sk_state_change */ - wake_up_interruptible_all(&wq->wait); - } else { - flags = key_to_poll(key); - if (flags & (EPOLLIN | EPOLLOUT)) - /* sk_data_ready or sk_write_space */ - wake_up_interruptible_sync_poll(&wq->wait, flags); - else if (flags & EPOLLERR) - /* sk_error_report */ - wake_up_interruptible_poll(&wq->wait, flags); - } - } - rcu_read_unlock(); -} - -static int smc_mark_clcwq_woken(wait_queue_entry_t *wait, unsigned int mode, - int sync, void *key) +static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_mark_wake_up *mark; - - mark = container_of(wait, struct smc_mark_wake_up, - wait_entry); - mark->woken = true; - mark->key = key; - return 0; -} - -static void smc_forward_wake_up(struct smc_sock *smc, struct sock *clcsk, - void (*clcsk_callback)(struct sock *sk)) -{ - struct smc_mark_wake_up mark = { .woken = false }; - struct socket_wq *wq; - - rcu_read_lock(); - /* ensure that clcsk->sk_wq still exists */ - wq = rcu_dereference(clcsk->sk_wq); - if (!wq) { - rcu_read_unlock(); - return; - } - - init_waitqueue_func_entry(&mark.wait_entry, - smc_mark_clcwq_woken); - add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); - clcsk_callback(clcsk); - remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); - rcu_read_unlock(); - - if (mark.woken) - smc_wake_up_waitqueue(smc, mark.key); -} - -static void 
smc_clcsock_state_change(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up(smc, clcsk, smc->clcsk_state_change); -} - -static void smc_clcsock_data_ready(struct sock *clcsk) -{ - struct smc_sock *smc; + struct smc_sock *lsmc; - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) + lsmc = (struct smc_sock *) + ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!lsmc) return; - - if (!smc->use_fallback) { - /* listening situation */ - smc->clcsk_data_ready(clcsk); - if (smc->sk.sk_state == SMC_LISTEN) { - sock_hold(&smc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_hs_wq, &smc->tcp_listen_work)) - sock_put(&smc->sk); - } - } else { - /* fallback situation */ - smc_forward_wake_up(smc, clcsk, smc->clcsk_data_ready); + lsmc->clcsk_data_ready(listen_clcsock); + if (lsmc->sk.sk_state == SMC_LISTEN) { + sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) + sock_put(&lsmc->sk); } } -static void smc_clcsock_write_space(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up(smc, clcsk, smc->clcsk_write_space); -} - -static void smc_clcsock_error_report(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up(smc, clcsk, smc->clcsk_error_report); -} - static int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; diff --git a/net/smc/smc.h b/net/smc/smc.h index 13cd350be773..5709f47fccd2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -135,12 +135,6 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; 
-struct smc_mark_wake_up { - bool woken; - void *key; - wait_queue_entry_t wait_entry; -}; - struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -236,14 +230,8 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ - void (*clcsk_state_change)(struct sock *sk); - /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); /* original data_ready fct. **/ - void (*clcsk_write_space)(struct sock *sk); - /* original write_space fct. */ - void (*clcsk_error_report)(struct sock *sk); - /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ -- Gitee From 71d2037acaf11120b838735fb1fda5dca2b34572 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:06:05 +0800 Subject: [PATCH 038/148] Revert "anolis: net/smc: Avoid setting clcsock options after clcsock released" This reverts commit 1598f11a5bee15ca1eba18e03c4686d3edc5e122. 
--- net/smc/af_smc.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 86da7113774c..e6e3f4e20618 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2437,11 +2437,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ - mutex_lock(&smc->clcsock_release_lock); - if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); - return -EBADF; - } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else @@ -2451,7 +2446,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } - mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -2511,21 +2505,13 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; - int rc; smc = smc_sk(sock->sk); - mutex_lock(&smc->clcsock_release_lock); - if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); - return -EBADF; - } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) return -EOPNOTSUPP; - rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); - mutex_unlock(&smc->clcsock_release_lock); - return rc; } static int smc_ioctl(struct socket *sock, unsigned int cmd, -- Gitee From c49bb6ba2c612c65b75a3f935c8304cbc5cfec5b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:06:13 +0800 Subject: [PATCH 039/148] Revert "anolis: net/smc: Resolve the race between SMC-R link access and clear" This reverts commit 19623184f623140fe89cf31f0334785644a2f0c0. 
--- net/smc/smc_core.c | 43 ++++++++----------------------------------- net/smc/smc_core.h | 4 ---- 2 files changed, 8 insertions(+), 39 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 6e88c2a8ec97..49fe804a593b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -155,7 +155,6 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) if (!conn->lnk) return SMC_CLC_DECL_NOACTLINK; atomic_inc(&conn->lnk->conn_cnt); - smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ return 0; } @@ -766,8 +765,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); - refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ - lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; @@ -1021,12 +1018,8 @@ void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); - /* put old link, hold in smcr_lgr_conn_assign_link() */ - smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); - /* hold new link, put in smc_conn_free() */ - smcr_link_hold(conn->lnk); } struct smc_link *smc_switch_conns(struct smc_link_group *lgr, @@ -1167,9 +1160,9 @@ void smc_conn_free(struct smc_connection *conn) /* smc connection wasn't registered to a link group * or has already been freed before. * - * Judge these to ensure that lgr/link refcnt will be - * put only once if connection has been registered to - * a link group successfully. + * Judge these to ensure that lgr refcnt will be put + * only once if connection has been registered to a + * link group successfully. 
*/ return; @@ -1194,8 +1187,6 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); lgr_put: - if (!lgr->is_smcd) - smcr_link_put(conn->lnk); /* link_hold in smcr_lgr_conn_assign_link() */ smc_lgr_put(lgr); /* lgr_hold in smc_lgr_register_conn() */ } @@ -1252,23 +1243,13 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } } -void __smcr_link_clear(struct smc_link *lnk) -{ - smc_wr_free_link_mem(lnk); - smc_lgr_put(lnk->lgr); /* lgr_hold in smcr_link_init() */ - memset(lnk, 0, sizeof(struct smc_link)); - lnk->state = SMC_LNK_UNUSED; -} - /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { struct smc_ib_device *smcibdev; - if (lnk->clearing || !lnk->lgr || - lnk->state == SMC_LNK_UNUSED) + if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) return; - lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); @@ -1277,23 +1258,15 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); + smc_wr_free_link_mem(lnk); + smc_lgr_put(lnk->lgr); /* lgr_hold in smcr_link_init() */ smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); - smcr_link_put(lnk); /* theoretically last link_put */ -} - -void smcr_link_hold(struct smc_link *lnk) -{ - refcount_inc(&lnk->refcnt); -} - -void smcr_link_put(struct smc_link *lnk) -{ - if (refcount_dec_and_test(&lnk->refcnt)) - __smcr_link_clear(lnk); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 4d4ab95dffb0..5ab4fdb433c0 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -138,8 +138,6 @@ struct smc_link { u8 
peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ - u8 clearing : 1; /* link is being cleared */ - refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ @@ -526,8 +524,6 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); -void smcr_link_hold(struct smc_link *lnk); -void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); -- Gitee From 22d5134e14dc19de86a77ca0717f8e6fe46ee982 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:06:20 +0800 Subject: [PATCH 040/148] Revert "anolis: net/smc: Resolve the race between link group access and termination" This reverts commit f75d9ba25d744ff918ec273b9cfbe3f6068a0123. 
--- net/smc/smc.h | 1 - net/smc/smc_core.c | 42 ++++-------------------------------------- net/smc/smc_core.h | 3 --- 3 files changed, 4 insertions(+), 42 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 5709f47fccd2..449ccf6d7dd2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -223,7 +223,6 @@ struct smc_connection { u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ - u8 freed : 1; /* normal termiation */ u8 out_of_sync : 1; /* out of sync with peer */ }; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 49fe804a593b..09939b7e9d1e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -186,7 +186,6 @@ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); - smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ conn->lgr->conns_num++; return 0; } @@ -219,6 +218,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); + conn->lgr = NULL; } int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) @@ -768,7 +768,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; - smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; lnk->link_down_cnt_smc = 0; lnk->link_down_cnt_ib = 0; @@ -863,7 +862,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->terminating = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ mutex_init(&lgr->sndbufs_lock); mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -1156,20 +1154,8 @@ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if 
(!lgr || conn->freed) - /* smc connection wasn't registered to a link group - * or has already been freed before. - * - * Judge these to ensure that lgr refcnt will be put - * only once if connection has been registered to a - * link group successfully. - */ + if (!lgr) return; - - conn->freed = 1; - if (conn->killed) - goto lgr_put; - if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); @@ -1186,8 +1172,6 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); -lgr_put: - smc_lgr_put(lgr); /* lgr_hold in smc_lgr_register_conn() */ } /* unregister a link from a buf_desc */ @@ -1259,7 +1243,6 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); - smc_lgr_put(lnk->lgr); /* lgr_hold in smcr_link_init() */ smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; @@ -1334,13 +1317,6 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) __smc_lgr_free_bufs(lgr, true); } -/* won't be freed until no one accesses to lgr anymore */ -static void __smc_lgr_free(struct smc_link_group *lgr) -{ - smc_lgr_free_bufs(lgr); - kfree(lgr); -} - /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { @@ -1356,6 +1332,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) smc_llc_lgr_clear(lgr); } + smc_lgr_free_bufs(lgr); destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); @@ -1367,18 +1344,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } - smc_lgr_put(lgr); /* theoretically last lgr_put */ -} - -void smc_lgr_hold(struct smc_link_group *lgr) -{ - refcount_inc(&lgr->refcnt); -} - -void smc_lgr_put(struct smc_link_group *lgr) -{ - if (refcount_dec_and_test(&lgr->refcnt)) - __smc_lgr_free(lgr); + kfree(lgr); } static void smc_sk_wake_ups(struct 
smc_sock *smc) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5ab4fdb433c0..d1a360b8cb6e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -252,7 +252,6 @@ struct smc_link_group { u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ - refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; @@ -491,8 +490,6 @@ struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); -void smc_lgr_hold(struct smc_link_group *lgr); -void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, -- Gitee From 29898086b41eec5efed856f9e90a2a73b965ff6d Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:06:28 +0800 Subject: [PATCH 041/148] Revert "anolis: net/smc: Reset conn->lgr when link group registration failed" This reverts commit 545513b3350617144c1088c8a26b7de567004deb. 
--- net/smc/smc_core.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 09939b7e9d1e..3fb8dc3d0708 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -171,10 +171,8 @@ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) if (!conn->lgr->is_smcd) { rc = smcr_lgr_conn_assign_link(conn, first); - if (rc) { - conn->lgr = NULL; + if (rc) return rc; - } } /* find a new alert_token_local value not yet used by some connection * in this link group @@ -1894,14 +1892,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) write_lock_bh(&lgr->conns_lock); rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); - if (rc) { - spin_lock_bh(lgr_lock); - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(lgr_lock); - __smc_lgr_terminate(lgr, true); + if (rc) goto out; - } } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; -- Gitee From 0ec551c8062c7b553ced00320be0e6a9f776ac79 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:06:37 +0800 Subject: [PATCH 042/148] Revert "anolis: net/smc: Avoid unmapping bufs from unused links" This reverts commit 5eb211946d37d1073de33fcf257a32259bfefc54. 
--- net/smc/smc_core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3fb8dc3d0708..33ed59e2996c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1255,11 +1255,8 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, { int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_UNUSED) - continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - } if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); -- Gitee From 9c360c50c5f097f60b2660a16bb37fc99d55eaa2 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:06:46 +0800 Subject: [PATCH 043/148] Revert "anolis: Revert "net/smc: Transfer remaining wait queue entries during fallback"" This reverts commit 565211c9b4f28a1218551f8ee6393c67d356e038. --- net/smc/af_smc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e6e3f4e20618..2bd94bf6d9fb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -575,6 +575,10 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { + wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); + wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); + unsigned long flags; + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -584,6 +588,16 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + + /* There may be some entries remaining in + * smc socket->wq, which should be removed + * to clcsocket->wq during the fallback. 
+ */ + spin_lock_irqsave(&smc_wait->lock, flags); + spin_lock(&clc_wait->lock); + list_splice_init(&smc_wait->head, &clc_wait->head); + spin_unlock(&clc_wait->lock); + spin_unlock_irqrestore(&smc_wait->lock, flags); } } -- Gitee From e4c9af994c086392896ce20a1cd85ffa5a2948c4 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:06:53 +0800 Subject: [PATCH 044/148] Revert "anolis: Revert "net/smc: Avoid warning of possible recursive locking"" This reverts commit 003d7ccceb159ab96b41fa15f8c605cc360992a8. --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2bd94bf6d9fb..9f621d860b88 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -594,7 +594,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * to clcsocket->wq during the fallback. */ spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock(&clc_wait->lock); + spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); list_splice_init(&smc_wait->head, &clc_wait->head); spin_unlock(&clc_wait->lock); spin_unlock_irqrestore(&smc_wait->lock, flags); -- Gitee From 7266e3a18bdac3fe7c0fb63bc29a1d1a6dace840 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:07:01 +0800 Subject: [PATCH 045/148] Revert "anolis: net/smc: allow different subnet communication" This reverts commit b50313af5213b8b230d617dff5a2b068877db3d1. 
--- net/smc/af_smc.c | 14 ++++---------- net/smc/smc_sysctl.c | 9 --------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9f621d860b88..3408f807b613 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1872,7 +1872,6 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -1880,12 +1879,10 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - if (!net->smc.sysctl_allow_different_subnet) { - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); - } + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -2833,7 +2830,6 @@ static __net_init int smc_net_init(struct net *net) net->smc.sysctl_rmem_default = init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_allow_different_subnet = 0; } return smc_pnet_net_init(net); @@ -2842,7 +2838,6 @@ static __net_init int smc_net_init(struct net *net) static void __net_exit smc_net_exit(struct net *net) { net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_allow_different_subnet = 0; smc_pnet_net_exit(net); } @@ -2974,7 +2969,6 @@ static int __init smc_init(void) init_net.smc.sysctl_wmem_default = 256 * 1024; init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; - init_net.smc.sysctl_allow_different_subnet = 0; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index db64caca933a..e3942837c3e3 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ 
-35,15 +35,6 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "allow_different_subnet", - .data = &init_net.smc.sysctl_allow_different_subnet, - .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; -- Gitee From e2ba77a67ba6cfccf30ec99203d8bafa5f951bb6 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:07:08 +0800 Subject: [PATCH 046/148] Revert "anolis: net/smc: don't call ib_req_notify_cq in the send routine" This reverts commit d6cfb59df3534343d5d267e8a801dbbdedf6a16b. --- net/smc/smc_ib.c | 6 ------ net/smc/smc_wr.c | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 8e2b1af1d291..65bf38cac7fd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -135,12 +135,6 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; - - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc) - goto out; - rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index de2a383322cb..c1a8c267975f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -308,6 +308,8 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From 1170f7e8d4c42eda46393735f3ab544cb73d1be5 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:07:14 +0800 Subject: [PATCH 047/148] Revert "anolis: net/smc: Add SMC-R link-down counters" This reverts commit 706face7ee0371c6569bff67e4701cd9b5c0e0cb. 
--- net/smc/smc_core.c | 14 +++----------- net/smc/smc_core.h | 2 -- net/smc/smc_diag.c | 2 -- net/smc/smc_llc.c | 3 --- net/smc/smc_tx.c | 4 +--- net/smc/smc_wr.c | 5 ----- 6 files changed, 4 insertions(+), 26 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 33ed59e2996c..0cc0045c4eb1 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -767,8 +767,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; lnk->link_idx = link_idx; - lnk->link_down_cnt_smc = 0; - lnk->link_down_cnt_ib = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); @@ -1074,20 +1072,16 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); - if (rc) { - ++to_lnk->link_down_cnt_smc; + if (rc) goto err_out; - } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); - if (rc) { - ++to_lnk->link_down_cnt_ib; + if (rc) goto err_out; - } goto again; } read_unlock_bh(&lgr->conns_lock); @@ -1698,10 +1692,8 @@ void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk) && - lnk->smcibdev == smcibdev && lnk->ibport == ibport) { - ++lnk->link_down_cnt_ib; + lnk->smcibdev == smcibdev && lnk->ibport == ibport) smcr_link_down_cond_sched(lnk); - } } } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index d1a360b8cb6e..ebca96fe3a2b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -148,8 +148,6 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this 
link */ - u64 link_down_cnt_smc; /* smc-caused link down counter */ - u64 link_down_cnt_ib; /* ib-caused link down counter */ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index ddecd39aa4a4..1fa7c7cf9332 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -155,8 +155,6 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .role = smc->conn.lgr->role, .lnk[0].ibport = smc->conn.lnk->ibport, .lnk[0].link_id = smc->conn.lnk->link_id, - .lnk[0].link_down_cnt_smc = smc->conn.lnk->link_down_cnt_smc, - .lnk[0].link_down_cnt_ib = smc->conn.lnk->link_down_cnt_ib, }; memcpy(linfo.lnk[0].ibname, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 1be23778c6e3..1d8dafa1a35e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1281,14 +1281,12 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, true, SMC_LLC_DEL_NO_ASYM_NEEDED); if (rc) { - ++lnk_new->link_down_cnt_ib; smcr_link_down_cond(lnk_new); goto out_free; } qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, SMC_LLC_DELETE_LINK); if (!qentry) { - ++lnk_new->link_down_cnt_ib; smcr_link_down_cond(lnk_new); goto out_free; } @@ -2102,7 +2100,6 @@ static void smc_llc_testlink_work(struct work_struct *work) if (!smc_link_active(link)) return; /* link state changed */ if (rc <= 0) { - ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); return; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index efe0b393a5fe..82735741bc2a 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -304,10 +304,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) { - ++link->link_down_cnt_ib; + if (rc) smcr_link_down_cond_sched(link); - } return rc; } 
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index c1a8c267975f..24be1d03fef9 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -125,7 +125,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) memset(link->lgr->wr_tx_buf_v2, 0, sizeof(*link->lgr->wr_tx_buf_v2)); } - ++link->link_down_cnt_ib; /* terminate link */ smcr_link_down_cond_sched(link); } @@ -220,7 +219,6 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { - ++link->link_down_cnt_smc; /* timeout - terminate link */ smcr_link_down_cond_sched(link); return -EPIPE; @@ -330,7 +328,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); } return rc; @@ -387,7 +384,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (atomic_dec_and_test(&link->wr_reg_refcnt)) wake_up_all(&link->wr_reg_wait); if (!rc) { - ++link->link_down_cnt_ib; /* timeout - terminate link */ smcr_link_down_cond_sched(link); return -EPIPE; @@ -468,7 +464,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: - ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); break; default: -- Gitee From f8e9231a7c9e54ad239a6f74b5f3fac2eee85d5a Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:07:21 +0800 Subject: [PATCH 048/148] Revert "anolis: net/smc: Add TX and RX diagnosis information" This reverts commit 09253fad15607ca3e0aac70e8c68674c6d040b5a. 
--- net/smc/smc.h | 6 ------ net/smc/smc_core.c | 15 --------------- net/smc/smc_diag.c | 6 ------ net/smc/smc_rx.c | 2 -- net/smc/smc_tx.c | 9 ++------- 5 files changed, 2 insertions(+), 36 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 449ccf6d7dd2..e897237c61f8 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -215,12 +215,6 @@ struct smc_connection { u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ - u64 rx_cnt; /* rx counter */ - u64 tx_cnt; /* tx counter */ - u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ - u64 rx_bytes; /* rx size */ - u64 tx_bytes; /* tx size */ - u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 out_of_sync : 1; /* out of sync with peer */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0cc0045c4eb1..7c543e49ae11 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1797,20 +1797,6 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } -static void smc_rx_tx_counter_init(struct smc_connection *conn) -{ - /* Initialize RX & TX diagnostic inform for each - * connection. 
These counters mean what smc wants - * net devices "TODO" insead of what has been "DONE" - */ - conn->rx_cnt = 0; - conn->tx_cnt = 0; - conn->tx_corked_cnt = 0; - conn->rx_bytes = 0; - conn->tx_bytes = 0; - conn->tx_corked_bytes = 0; -} - /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -1888,7 +1874,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); - smc_rx_tx_counter_init(conn); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 1fa7c7cf9332..40036e9926e0 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,12 +136,6 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, - .rx_cnt = conn->rx_cnt, - .tx_cnt = conn->tx_cnt, - .tx_corked_cnt = conn->tx_corked_cnt, - .rx_bytes = conn->rx_bytes, - .tx_bytes = conn->tx_bytes, - .tx_corked_bytes = conn->tx_corked_bytes, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index bf353c68323d..51e8eb2933ff 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -392,7 +392,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable--; /* always stop at urgent Byte */ /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); - conn->rx_bytes += copylen; /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - @@ -442,7 +441,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } 
trace_smc_rx_recvmsg(smc, copylen); - ++conn->rx_cnt; } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 82735741bc2a..02d147bde78c 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -239,19 +239,14 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) conn->urg_tx_pend = true; if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_desc->len >> 1))) { + (conn->sndbuf_desc->len >> 1))) /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ - conn->tx_corked_bytes += copylen; - ++conn->tx_corked_cnt; queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_CORK_DELAY); - } else { - conn->tx_bytes += copylen; - ++conn->tx_cnt; + else smc_tx_sndbuf_nonempty(conn); - } trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From f4d9f8cb46e50f4da0d4d46a632ed6cbe21c3943 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:07:29 +0800 Subject: [PATCH 049/148] Revert "anolis: net/smc: Introduce TCP to SMC replacement netlink commands" This reverts commit b532031d2f0da32f5e399d0c1bd5dc846cb58e39. 
--- net/smc/Makefile | 2 +- net/smc/af_smc.c | 14 +--- net/smc/smc_conv.c | 186 ------------------------------------------ net/smc/smc_conv.h | 22 ----- net/smc/smc_netlink.c | 19 +---- net/smc/smc_netlink.h | 5 -- 6 files changed, 4 insertions(+), 244 deletions(-) delete mode 100644 net/smc/smc_conv.c delete mode 100644 net/smc/smc_conv.h diff --git a/net/smc/Makefile b/net/smc/Makefile index 72b3c934e473..19076ff20d58 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o smc_conv.o +smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3408f807b613..9baeb0baf659 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,7 +53,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_proc.h" -#include "smc_conv.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -2944,22 +2943,16 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_conv_init(); - if (rc) { - pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); - goto out_proc; - } - rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_conv; + goto out_proc; } rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_conv; + goto out_proc; } limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); @@ -2977,8 +2970,6 @@ static int __init smc_init(void) static_branch_enable(&tcp_have_smc); return 0; -out_conv: - smc_conv_exit(); out_proc: smc_proc_exit(); out_sock: @@ -3007,7 +2998,6 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); - 
smc_conv_exit(); smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_conv.c b/net/smc/smc_conv.c deleted file mode 100644 index e1f87d1de8a5..000000000000 --- a/net/smc/smc_conv.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include -#include "smc_netlink.h" -#include "smc_conv.h" - -int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *wlist_elem, *tmp; - char msg[TASK_COMM_LEN]; - struct nlattr *na; - - na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; - if (!na) - return -EINVAL; - - nla_strlcpy(msg, na, TASK_COMM_LEN); - - mutex_lock(wlist_lock); - if (*wlist_len >= SMC_MAX_WLIST_LEN) { - mutex_unlock(wlist_lock); - return -EINVAL; - } - - list_for_each_entry(tmp, wlist, list) { - if (!strcmp(tmp->task_comm, msg)) - goto out; - } - - wlist_elem = kmalloc(sizeof(*wlist_elem), GFP_KERNEL); - if (!wlist_elem) { - mutex_unlock(wlist_lock); - return -ENOMEM; - } - - strcpy(wlist_elem->task_comm, msg); - list_add_tail_rcu(&wlist_elem->list, wlist); - ++*wlist_len; -out: - mutex_unlock(wlist_lock); - return 0; -} - -int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *tmp, *nxt; - char msg[TASK_COMM_LEN]; - struct nlattr *na; - - na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; - if (!na) - return -EINVAL; - - nla_strlcpy(msg, na, TASK_COMM_LEN); - - mutex_lock(wlist_lock); - list_for_each_entry_safe(tmp, nxt, wlist, list) { - if (!strcmp(tmp->task_comm, msg)) { - 
list_del_rcu(&tmp->list); - synchronize_rcu(); - kfree(tmp); - --*wlist_len; - break; - } - } - mutex_unlock(wlist_lock); - return 0; -} - -int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - struct list_head *wlist = &net->smc.smc_conv.wlist; - struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); - struct smc_conv_wlist_elem *tmp; - void *nlh; - - if (cb_ctx->pos[0]) - goto errmsg; - - nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &smc_gen_nl_family, NLM_F_MULTI, - SMC_NETLINK_GET_TCP2SMC_WLIST); - if (!nlh) - goto errmsg; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp, wlist, list) { - if (nla_put(skb, SMC_CMD_ATTR_TCP2SMC, - nla_total_size(strlen(tmp->task_comm) + 1), - tmp->task_comm)) { - rcu_read_unlock(); - goto errattr; - } - } - rcu_read_unlock(); - - genlmsg_end(skb, nlh); - cb_ctx->pos[0] = 1; - return skb->len; - -errattr: - genlmsg_cancel(skb, nlh); -errmsg: - return skb->len; -} - -static int smc_match_tcp2smc_wlist(struct net *net, char *comm) -{ - struct list_head *wlist = &net->smc.smc_conv.wlist; - struct smc_conv_wlist_elem *tmp; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp, wlist, list) { - if (!strcmp(tmp->task_comm, comm)) { - rcu_read_unlock(); - return 0; - } - } - rcu_read_unlock(); - return -1; -} - -static int __net_init smc_net_conv_init(struct net *net) -{ - INIT_LIST_HEAD_RCU(&net->smc.smc_conv.wlist); - net->smc.smc_conv.wlist_len = 0; - - mutex_init(&net->smc.smc_conv.wlist_lock); - - rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, - smc_match_tcp2smc_wlist); - return 0; -} - -static void __net_exit smc_net_conv_exit(struct net *net) -{ - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *cur, *nxt; - struct list_head tmp_list; - - 
rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, NULL); - synchronize_rcu(); - - INIT_LIST_HEAD(&tmp_list); - - mutex_lock(wlist_lock); - list_splice_init_rcu(wlist, &tmp_list, synchronize_rcu); - *wlist_len = 0; - mutex_unlock(wlist_lock); - - list_for_each_entry_safe(cur, nxt, &tmp_list, list) { - list_del(&cur->list); - kfree(cur); - } -} - -static struct pernet_operations smc_conv_ops = { - .init = smc_net_conv_init, - .exit = smc_net_conv_exit, -}; - -int __init smc_conv_init(void) -{ - return register_pernet_subsys(&smc_conv_ops); -} - -void smc_conv_exit(void) -{ - unregister_pernet_subsys(&smc_conv_ops); -} diff --git a/net/smc/smc_conv.h b/net/smc/smc_conv.h deleted file mode 100644 index 1615b27feede..000000000000 --- a/net/smc/smc_conv.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef NET_SMC_SMC_CONV_H_ -#define NET_SMC_SMC_CONV_H_ -#include -#include -#include - -#define SMC_MAX_WLIST_LEN 32 - -struct smc_conv_wlist_elem { - char task_comm[TASK_COMM_LEN]; - struct list_head list; -}; - -int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); -int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); -int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb); -int __init smc_conv_init(void); -void smc_conv_exit(void); - -#endif /* NET_SMC_SMC_CONV_H_ */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index f2007aa124cf..f13ab0661ed5 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -22,7 +22,6 @@ #include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" -#include "smc_conv.h" const struct nla_policy smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { @@ -112,25 +111,9 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_seid, }, - { - .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .doit = smc_nl_add_tcp2smc_wlist, - }, - { - .cmd = 
SMC_NETLINK_DEL_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .doit = smc_nl_del_tcp2smc_wlist, - }, - { - .cmd = SMC_NETLINK_GET_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .dumpit = smc_nl_get_tcp2smc_wlist, - }, }; -static const struct nla_policy smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { - [SMC_CMD_ATTR_TCP2SMC] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN - 1 }, +static const struct nla_policy smc_gen_nl_policy[2] = { [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, }; diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index aae13737095e..e8c6c3f0e98c 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -15,11 +15,6 @@ #include #include -enum { - SMC_CMD_ATTR_TCP2SMC = 1, - SMC_CMD_MAX_ATTR, -}; - extern struct genl_family smc_gen_nl_family; extern const struct nla_policy smc_gen_ueid_policy[]; -- Gitee From 4582bc5fdf63911ec90ccc4da4be7e88e7610a71 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:07:38 +0800 Subject: [PATCH 050/148] Revert "anolis: net/smc: Introduce SMC-R-related proc files" This reverts commit 86b611eef9e6b3e7d516000b21cca0da14bd5d4c. 
--- net/smc/Makefile | 2 +- net/smc/af_smc.c | 29 ++--- net/smc/smc_diag.c | 29 +++-- net/smc/smc_proc.c | 287 --------------------------------------------- net/smc/smc_proc.h | 34 ------ 5 files changed, 22 insertions(+), 359 deletions(-) delete mode 100644 net/smc/smc_proc.c delete mode 100644 net/smc/smc_proc.h diff --git a/net/smc/Makefile b/net/smc/Makefile index 19076ff20d58..640af9a39f9c 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o +smc-y += smc_tracepoint.o smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9baeb0baf659..71bbbd526137 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -52,7 +52,6 @@ #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" -#include "smc_proc.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -87,13 +86,11 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - write_lock_bh(&h->lock); - - head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + head = &h->ht; + write_lock_bh(&h->lock); sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - write_unlock_bh(&h->lock); return 0; @@ -2864,7 +2861,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc, i; + int rc; int max_rshare, max_wshare; unsigned long limit; @@ -2931,28 +2928,19 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - - for (i = 0; i < SMC_HTABLE_SIZE; i++) { - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); - } - - rc = smc_proc_init(); - if (rc) { - pr_err("%s: 
smc_proc_init fails with %d\n", __func__, rc); - goto out_sock; - } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_proc; + goto out_sock; } rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_proc; + goto out_sock; } limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); @@ -2970,8 +2958,6 @@ static int __init smc_init(void) static_branch_enable(&tcp_have_smc); return 0; -out_proc: - smc_proc_exit(); out_sock: sock_unregister(PF_SMC); out_proto6: @@ -2998,7 +2984,6 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); - smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 40036e9926e0..c952986a6aca 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -196,25 +196,24 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; - int rc = 0, num = 0, slot; + int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - - for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { - head = &prot->h.smc_hash->ht[slot]; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + head = &prot->h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; - } + num++; } out: diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c deleted file mode 100644 index 19d8cc82a7ac..000000000000 --- 
a/net/smc/smc_proc.c +++ /dev/null @@ -1,287 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include "smc.h" -#include "smc_proc.h" -#include "smc_core.h" - -static void *smc_get_next(struct seq_file *seq, void *cur) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? - smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - struct net *net = seq_file_net(seq); - struct hlist_head *head; - struct sock *sk = cur; - - if (!sk) { - read_lock(&smc_hash->lock); -get_head: - head = &smc_hash->ht[sp->bucket]; - sk = sk_head(head); - sp->offset = 0; - goto get_sk; - } - ++sp->num; - ++sp->offset; - - sk = sk_next(sk); -get_sk: - sk_for_each_from(sk) { - if (!net_eq(sock_net(sk), net)) - continue; - return sk; - } - sp->offset = 0; - if (++sp->bucket < SMC_HTABLE_SIZE) - goto get_head; - - read_unlock(&smc_hash->lock); - return NULL; -} - -static void *smc_seek_last_pos(struct seq_file *seq) -{ - struct smc_proc_private *sp = seq->private; - int offset = sp->offset; - int orig_num = sp->num; - void *rc = NULL; - - if (sp->bucket >= SMC_HTABLE_SIZE) - goto out; - - rc = smc_get_next(seq, NULL); - while (offset-- && rc) - rc = smc_get_next(seq, rc); - - if (rc) - goto out; - - sp->bucket = 0; -out: - sp->num = orig_num; - return rc; -} - -static void *smc_get_idx(struct seq_file *seq, loff_t pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - sp->bucket = 0; - rc = smc_get_next(seq, NULL); - - while (rc && pos) { - rc = smc_get_next(seq, rc); - --pos; - } - return rc; -} - -static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - if (*pos && *pos == sp->last_pos) { - rc = smc_seek_last_pos(seq); - if (rc) - goto out; - } - - sp->num = 0; - sp->bucket = 0; - sp->offset = 0; - sp->protocol = protocol; - rc = *pos ? 
smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - -out: - sp->last_pos = *pos; - return rc; -} - -static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC); -} - -static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC6); -} - -static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) -{ - struct smc_proc_private *sp = seq->private; - const struct in6_addr *dest, *src; - struct smc_link_group *lgr; - struct socket *clcsock; - struct smc_link *lnk; - struct sock *sk; - bool fb = false; - int i; - - fb = smc->use_fallback; - clcsock = smc->clcsock; - sk = &smc->sk; - - if (protocol == SMCPROTO_SMC) - seq_printf(seq, CONN4_ADDR_FM, sp->num, - clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, - clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); - else if (protocol == SMCPROTO_SMC6) { - dest = &clcsock->sk->sk_v6_daddr; - src = &clcsock->sk->sk_v6_rcv_saddr; - seq_printf(seq, CONN6_ADDR_FM, sp->num, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); - } - - seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, - sk, clcsock->sk, fb ? clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); - - lgr = smc->conn.lgr; - lnk = smc->conn.lnk; - - if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { - for (i = 0; i < SMC_LGR_ID_SIZE; i++) - seq_printf(seq, "%02X", lgr->id[i]); - - seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', - lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); - } else { - seq_puts(seq, "- - - - - - - -\n"); - } -} - -static int smc_conn_show(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct socket *clcsock; - struct smc_sock *smc; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, - "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", - "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_cnt", "rx_cnt"); - goto out; - } - - smc = smc_sk(v); - clcsock = smc->clcsock; - if (!clcsock) - goto out; - - _conn_show(seq, smc, sp->protocol); -out: - return 0; -} - -static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc = NULL; - - if (v == SEQ_START_TOKEN) { - rc = smc_get_idx(seq, 0); - goto out; - } - rc = smc_get_next(seq, v); -out: - ++*pos; - sp->last_pos = *pos; - return rc; -} - -static void smc_conn_stop(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? 
- smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - - if (v && v != SEQ_START_TOKEN) - read_unlock(&smc_hash->lock); -} - -static struct smc_proc_entry smc_proc[] = { - { - .name = "smc4", - .ops = { - .show = smc_conn_show, - .start = smc_conn4_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#if IS_ENABLED(CONFIG_IPV6) - { - .name = "smc6", - .ops = { - .show = smc_conn_show, - .start = smc_conn6_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#endif -}; - -static int __net_init smc_proc_dir_init(struct net *net) -{ - int i, rc = -ENOMEM; - - net->proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); - if (!net->proc_net_smc) - goto err; - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { - if (!proc_create_net_data(smc_proc[i].name, 0444, - net->proc_net_smc, &smc_proc[i].ops, - sizeof(struct smc_proc_private), - NULL)) - goto err_entry; - } - - return 0; - -err_entry: - for (i -= 1; i >= 0; i--) - remove_proc_entry(smc_proc[i].name, net->proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -err: - return rc; -} - -static void __net_exit smc_proc_dir_exit(struct net *net) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) - remove_proc_entry(smc_proc[i].name, net->proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -} - -static struct pernet_operations smc_proc_ops = { - .init = smc_proc_dir_init, - .exit = smc_proc_dir_exit, -}; - -int __init smc_proc_init(void) -{ - return register_pernet_subsys(&smc_proc_ops); -} - -void smc_proc_exit(void) -{ - unregister_pernet_subsys(&smc_proc_ops); -} diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h deleted file mode 100644 index ec59ca03e163..000000000000 --- a/net/smc/smc_proc.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _SMC_PROC_H_ -#define _SMC_PROC_H_ - -#include -#include -#include -#include -#include -#include "smc.h" - -#define CONN4_HDR 
("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") -#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") - -struct smc_proc_private { - struct seq_net_private p; - int num, bucket, offset; - int protocol; - loff_t last_pos; -}; - -struct smc_proc_entry { - const char *name; - const struct seq_operations ops; -}; - -int __init smc_proc_init(void); -void smc_proc_exit(void); - -#endif -- Gitee From 02191b6587f617d915d382ca6b7dea1c36b0adfd Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:07:46 +0800 Subject: [PATCH 051/148] Revert "anolis: net/smc: Introduce sysctl tcp2smc" This reverts commit a18362bbb4f31020db64887205895544356fa183. --- net/smc/af_smc.c | 3 --- net/smc/smc_sysctl.c | 7 ------- 2 files changed, 10 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 71bbbd526137..91ef76230041 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2825,7 +2825,6 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_rmem_default = init_net.smc.sysctl_rmem_default; - net->smc.sysctl_tcp2smc = 0; } return smc_pnet_net_init(net); @@ -2833,7 +2832,6 @@ static __net_init int smc_net_init(struct net *net) static void __net_exit smc_net_exit(struct net *net) { - net->smc.sysctl_tcp2smc = 0; smc_pnet_net_exit(net); } @@ -2949,7 +2947,6 @@ static int __init smc_init(void) init_net.smc.sysctl_wmem_default = 256 * 1024; init_net.smc.sysctl_rmem_default = 384 * 1024; - init_net.smc.sysctl_tcp2smc = 0; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index e3942837c3e3..317b37095c4f 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c 
@@ -28,13 +28,6 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, - { - .procname = "tcp2smc", - .data = &init_net.smc.sysctl_tcp2smc, - .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; -- Gitee From b1b2479c88335d94cd5a9ffed0a1bd11ce1bf73b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:07:53 +0800 Subject: [PATCH 052/148] Revert "anolis: net/smc: Expose SMCPROTO_SMC and SMCPROTO_SMC6 to userspace" This reverts commit 8467b42882dc28c0e8b2bdaf059e4efce8280756. --- net/smc/smc.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/smc/smc.h b/net/smc/smc.h index e897237c61f8..2a8a54ad8b19 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,6 +21,10 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 + +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ + #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ -- Gitee From 1f1b938f6d253aa3816800ba13d61d42622ea832 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:08:02 +0800 Subject: [PATCH 053/148] Revert "anolis: net/smc: Introduce tunable sysctls for sndbuf and RMB size" This reverts commit a8ebaa18dba43b1d084d093e5404f739fdd74634. 
--- net/smc/Makefile | 2 +- net/smc/af_smc.c | 29 ++-------------- net/smc/smc.h | 5 --- net/smc/smc_sysctl.c | 81 -------------------------------------------- 4 files changed, 4 insertions(+), 113 deletions(-) delete mode 100644 net/smc/smc_sysctl.c diff --git a/net/smc/Makefile b/net/smc/Makefile index 640af9a39f9c..196fb6f01b14 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o +smc-y += smc_tracepoint.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 91ef76230041..4ef0f71c7548 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -246,8 +245,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - sk->sk_sndbuf = net->smc.sysctl_wmem_default; - sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -2742,6 +2739,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; } + smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); + smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); + out: return rc; } @@ -2820,13 +2820,6 @@ unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { - if (net != &init_net) { - net->smc.sysctl_wmem_default = - init_net.smc.sysctl_rmem_default; - net->smc.sysctl_rmem_default = - init_net.smc.sysctl_rmem_default; - } - return smc_pnet_net_init(net); } @@ -2860,8 +2853,6 @@ static struct pernet_operations smc_net_stat_ops = { static int __init 
smc_init(void) { int rc; - int max_rshare, max_wshare; - unsigned long limit; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -2941,17 +2932,6 @@ static int __init smc_init(void) goto out_sock; } - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); - max_wshare = min(4UL * 1024 * 1024, limit); - max_rshare = min(6UL * 1024 * 1024, limit); - - init_net.smc.sysctl_wmem_default = 256 * 1024; - init_net.smc.sysctl_rmem_default = 384 * 1024; - -#ifdef CONFIG_SYSCTL - smc_sysctl_init(); -#endif - static_branch_enable(&tcp_have_smc); return 0; @@ -2993,9 +2973,6 @@ static void __exit smc_exit(void) smc_clc_exit(); unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); -#ifdef CONFIG_SYSCTL - smc_sysctl_exit(); -#endif rcu_barrier(); } diff --git a/net/smc/smc.h b/net/smc/smc.h index 2a8a54ad8b19..1a4fc1c6c4ab 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -312,9 +312,4 @@ void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid); -#ifdef CONFIG_SYSCTL -int smc_sysctl_init(void); -void smc_sysctl_exit(void); -#endif - #endif /* __SMC_H */ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c deleted file mode 100644 index 317b37095c4f..000000000000 --- a/net/smc/smc_sysctl.c +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include - -#include "smc_core.h" - -static int min_sndbuf = SMC_BUF_MIN_SIZE; -static int min_rcvbuf = SMC_BUF_MIN_SIZE; - -static struct ctl_table smc_table[] = { - { - .procname = "wmem_default", - .data = &init_net.smc.sysctl_wmem_default, - .maxlen = sizeof(init_net.smc.sysctl_wmem_default), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &init_net.smc.sysctl_rmem_default, - .maxlen = sizeof(init_net.smc.sysctl_rmem_default), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = 
&min_rcvbuf, - }, - { } -}; - -static __net_init int smc_sysctl_init_net(struct net *net) -{ - struct ctl_table *table; - - table = smc_table; - if (!net_eq(net, &init_net)) { - int i; - - table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); - if (!table) - goto err_alloc; - - for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) - table[i].data += (void *)net - (void *)&init_net; - } - - net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); - if (!net->smc.smc_hdr) - goto err_reg; - - return 0; - -err_reg: - if (!net_eq(net, &init_net)) - kfree(table); -err_alloc: - return -ENOMEM; -} - -static __net_exit void smc_sysctl_exit_net(struct net *net) -{ - unregister_net_sysctl_table(net->smc.smc_hdr); -} - -static struct pernet_operations smc_sysctl_ops __net_initdata = { - .init = smc_sysctl_init_net, - .exit = smc_sysctl_exit_net, -}; - -int __init smc_sysctl_init(void) -{ - return register_pernet_subsys(&smc_sysctl_ops); -} - -void smc_sysctl_exit(void) -{ - unregister_pernet_subsys(&smc_sysctl_ops); -} -- Gitee From 7fe578a0fa380f1a3976517c216969a3324d677f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:43:37 +0800 Subject: [PATCH 054/148] Revert "net/smc: add comments for smc_link_{usable|sendable}" This reverts commit d782e377776a248a8a11782ec101cdfe139b3701. --- net/smc/smc_core.h | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index ebca96fe3a2b..5e8e37ec863b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -410,13 +410,7 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } -/* - * Returns true if the specified link is usable. - * - * usable means the link is ready to receive RDMA messages, map memory - * on the link, etc. 
This doesn't ensure we are able to send RDMA messages - * on this link, if sending RDMA messages is needed, use smc_link_sendable() - */ +/* returns true if the specified link is usable */ static inline bool smc_link_usable(struct smc_link *lnk) { if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) @@ -424,15 +418,6 @@ static inline bool smc_link_usable(struct smc_link *lnk) return true; } -/* - * Returns true if the specified link is ready to receive AND send RDMA - * messages. - * - * For the client side in first contact, the underlying QP may still in - * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable() - * is not strong enough. For those places that need to send any CDC or LLC - * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead - */ static inline bool smc_link_sendable(struct smc_link *lnk) { return smc_link_usable(lnk) && -- Gitee From 84da0ed9bde0417fe466cce29a0f6894f73b9a3b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:43:45 +0800 Subject: [PATCH 055/148] Revert "net/smc: remove redundant re-assignment of pointer link" This reverts commit 742b1227716047b3aed12541d21317c7e24c7efd. --- net/smc/smc_clc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 6be95a2a7b25..8409ab71a5e4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1021,6 +1021,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, struct smc_link *link = conn->lnk; /* SMC-R specific settings */ + link = conn->lnk; memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); clc->hdr.typev1 = SMC_TYPE_R; -- Gitee From a4657bae33a384e9b3e3bdc5818a0ef6d9e48287 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 3 Mar 2022 17:43:53 +0800 Subject: [PATCH 056/148] Revert "net/smc: Introduce TCP ULP support" This reverts commit 4d2f9f07c8e52bd4279c5bf96825f0da3a631e71. 
--- net/smc/af_smc.c | 93 ++++-------------------------------------------- 1 file changed, 7 insertions(+), 86 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4ef0f71c7548..c1583df1cd3f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2700,8 +2700,8 @@ static const struct proto_ops smc_sock_ops = { .splice_read = smc_splice_read, }; -static int __smc_create(struct net *net, struct socket *sock, int protocol, - int kern, struct socket *clcsock) +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) { int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; struct smc_sock *smc; @@ -2726,19 +2726,12 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc = smc_sk(sk); smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; - - rc = 0; - if (!clcsock) { - rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, - &smc->clcsock); - if (rc) { - sk_common_release(sk); - goto out; - } - } else { - smc->clcsock = clcsock; + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); + if (rc) { + sk_common_release(sk); + goto out; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); @@ -2746,76 +2739,12 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, return rc; } -static int smc_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - return __smc_create(net, sock, protocol, kern, NULL); -} - static const struct net_proto_family smc_sock_family_ops = { .family = PF_SMC, .owner = THIS_MODULE, .create = smc_create, }; -static int smc_ulp_init(struct sock *sk) -{ - struct socket *tcp = sk->sk_socket; - struct net *net = sock_net(sk); - struct socket *smcsock; - int protocol, ret; - - /* only TCP can be replaced */ - if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || - 
(sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) - return -ESOCKTNOSUPPORT; - /* don't handle wq now */ - if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) - return -ENOTCONN; - - if (sk->sk_family == AF_INET) - protocol = SMCPROTO_SMC; - else - protocol = SMCPROTO_SMC6; - - smcsock = sock_alloc(); - if (!smcsock) - return -ENFILE; - - smcsock->type = SOCK_STREAM; - __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ - ret = __smc_create(net, smcsock, protocol, 1, tcp); - if (ret) { - sock_release(smcsock); /* module_put() which ops won't be NULL */ - return ret; - } - - /* replace tcp socket to smc */ - smcsock->file = tcp->file; - smcsock->file->private_data = smcsock; - smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ - smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ - tcp->file = NULL; - - return ret; -} - -static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, - const gfp_t priority) -{ - struct inet_connection_sock *icsk = inet_csk(newsk); - - /* don't inherit ulp ops to child when listen */ - icsk->icsk_ulp_ops = NULL; -} - -static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { - .name = "smc", - .owner = THIS_MODULE, - .init = smc_ulp_init, - .clone = smc_ulp_clone, -}; - unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) @@ -2926,12 +2855,6 @@ static int __init smc_init(void) goto out_sock; } - rc = tcp_register_ulp(&smc_ulp_ops); - if (rc) { - pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_sock; - } - static_branch_enable(&tcp_have_smc); return 0; @@ -2960,7 +2883,6 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); - tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); @@ -2983,4 +2905,3 @@ MODULE_AUTHOR("Ursula Braun "); MODULE_DESCRIPTION("smc socket address 
family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); -MODULE_ALIAS_TCP_ULP("smc"); -- Gitee From 0c0dc8bd2c5d0e1d5b36d2b65bcdd199422b4ab9 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:24:52 +0800 Subject: [PATCH 057/148] Revert "net/smc: Introduce net namespace support for linkgroup" This reverts commit 90589de564189cfe44d5d20fdd217e45fbc02eac. --- net/smc/smc_core.c | 24 +++++++----------------- net/smc/smc_core.h | 2 -- net/smc/smc_ib.h | 7 ------- net/smc/smc_pnet.c | 21 +++++---------------- 4 files changed, 12 insertions(+), 42 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7c543e49ae11..3c73a665d110 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -916,7 +916,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_wr_free_lgr_mem(lgr); goto free_wq; } - lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); @@ -1605,8 +1604,7 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN) || lgr->type == SMC_LGR_SYMMETRIC || - lgr->type == SMC_LGR_ASYMMETRIC_PEER || - !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) + lgr->type == SMC_LGR_ASYMMETRIC_PEER) continue; /* trigger local add link processing */ @@ -1764,10 +1762,8 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, u8 peer_systemid[], u8 peer_gid[], u8 peer_mac_v1[], - enum smc_lgr_role role, u32 clcqpn, - struct net *net) + enum smc_lgr_role role, u32 clcqpn) { - struct smc_link *lnk; int i; if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || @@ -1775,17 +1771,12 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, return false; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lnk = &lgr->lnk[i]; - - if (!smc_link_active(lnk)) + if (!smc_link_active(&lgr->lnk[i])) continue; - /* use verbs API to check 
netns, instead of lgr->net */ - if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net)) - return false; - if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) && - !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) && + if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && + !memcmp(lgr->lnk[i].peer_gid, peer_gid, SMC_GID_SIZE) && (smcr_version == SMC_V2 || - !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN))) + !memcmp(lgr->lnk[i].peer_mac, peer_mac_v1, ETH_ALEN))) return true; } return false; @@ -1801,7 +1792,6 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; - struct net *net = sock_net(&smc->sk); struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; @@ -1828,7 +1818,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) smcr_lgr_match(lgr, ini->smcr_version, ini->peer_systemid, ini->peer_gid, ini->peer_mac, role, - ini->ib_clcqpn, net)) && + ini->ib_clcqpn)) && !lgr->sync_err && (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5e8e37ec863b..f2a12d79793f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -307,8 +307,6 @@ struct smc_link_group { u8 nexthop_mac[ETH_ALEN]; u8 uses_gateway; __be32 saddr; - /* net namespace */ - struct net *net; }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5d8b49c57f50..bfa1c6bf6313 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -69,13 +69,6 @@ static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE]) return cpu_to_be32(INADDR_NONE); } -static inline struct net *smc_ib_net(struct smc_ib_device *smcibdev) -{ - if (smcibdev && smcibdev->ibdev) - return read_pnet(&smcibdev->ibdev->coredev.rdma_net); - return NULL; -} - struct smc_init_info_smcrv2; struct smc_buf_desc; struct smc_link; diff --git a/net/smc/smc_pnet.c 
b/net/smc/smc_pnet.c index 13df00306182..fb1952478ac8 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -977,16 +977,14 @@ static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, - struct smc_ib_device *known_dev, - struct net *net) + struct smc_ib_device *known_dev) { struct smc_ib_device *ibdev; int i; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - if (ibdev == known_dev || - !rdma_dev_access_netns(ibdev->ibdev, net)) + if (ibdev == known_dev) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) @@ -1003,14 +1001,12 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, mutex_unlock(&smc_ib_devices.mutex); } -/* find alternate roce device with same pnet_id, vlan_id and net namespace */ +/* find alternate roce device with same pnet_id and vlan_id */ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, struct smc_init_info *ini, struct smc_ib_device *known_dev) { - struct net *net = lgr->net; - - _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev); } /* if handshake network device belongs to a roce device, return its @@ -1019,7 +1015,6 @@ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { - struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); @@ -1027,10 +1022,6 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct net_device *ndev; int i; - /* check rdma net namespace */ - if (!rdma_dev_access_netns(ibdev->ibdev, net)) - continue; - for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; @@ -1061,17 +1052,15 @@ static void smc_pnet_find_roce_by_pnetid(struct 
net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct net *net; ndev = pnet_find_base_ndev(ndev); - net = dev_net(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, -- Gitee From 859ffd71c579fd362205e0640b6563bae7cf2429 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:24:59 +0800 Subject: [PATCH 058/148] Revert "net/smc: Use the bitmap API when applicable" This reverts commit c6753e0800130fb71678c9d22f1d50d3ba277ce2. --- net/smc/smc_wr.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 24be1d03fef9..c6cfdea8b71b 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -54,7 +54,11 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ /* returns true if at least one tx work request is pending on the given link */ static inline bool smc_wr_is_tx_pend(struct smc_link *link) { - return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt); + if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) != + link->wr_tx_cnt) { + return true; + } + return false; } /* wait till all pending tx work requests on the given link are completed */ @@ -692,7 +696,7 @@ void smc_wr_free_link_mem(struct smc_link *lnk) lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); lnk->wr_tx_pends = NULL; - bitmap_free(lnk->wr_tx_mask); + kfree(lnk->wr_tx_mask); lnk->wr_tx_mask = NULL; kfree(lnk->wr_tx_sges); lnk->wr_tx_sges = NULL; @@ -768,7 +772,9 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; - link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, 
GFP_KERNEL); + link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT), + sizeof(*link->wr_tx_mask), + GFP_KERNEL); if (!link->wr_tx_mask) goto no_mem_wr_rx_sges; link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, @@ -881,7 +887,8 @@ int smc_wr_create_link(struct smc_link *lnk) goto dma_unmap; } smc_wr_init_sge(lnk); - bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); init_waitqueue_head(&lnk->wr_tx_wait); atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); -- Gitee From c1edb6452cff9a43beb6f8232f1001507ce61dc1 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:03 +0800 Subject: [PATCH 059/148] Revert "net/smc: fix kernel panic caused by race of smc_sock" This reverts commit 3a036169740d05f5a333cdcabd547bc07036f93f. --- net/smc/smc.h | 5 ----- net/smc/smc_cdc.c | 52 +++++++++++++++++++++++++--------------------- net/smc/smc_cdc.h | 2 +- net/smc/smc_core.c | 25 +++++----------------- net/smc/smc_ib.c | 4 ++-- net/smc/smc_ib.h | 1 - net/smc/smc_wr.c | 41 +++++++++++++++++++++++++++++++++--- net/smc/smc_wr.h | 3 ++- 8 files changed, 76 insertions(+), 57 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 1a4fc1c6c4ab..f4286ca1f228 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -180,11 +180,6 @@ struct smc_connection { u16 tx_cdc_seq; /* sequence # for CDC send */ u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ - atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe - * - inc when post wqe, - * - dec on polled tx cqe - */ - wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 84c8a4374fdd..99acd337ba90 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -31,6 +31,10 @@ static void smc_cdc_tx_handler(struct 
smc_wr_tx_pend_priv *pnd_snd, struct smc_sock *smc; int diff; + if (!conn) + /* already dismissed */ + return; + smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { @@ -47,12 +51,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn); conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && - unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) - wake_up(&conn->cdc_pend_tx_wq); - WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); - smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); } @@ -109,10 +107,6 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); - - atomic_inc(&conn->cdc_pend_tx_wr); - smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ - rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); @@ -120,7 +114,6 @@ int smc_cdc_msg_send(struct smc_connection *conn, } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - atomic_dec(&conn->cdc_pend_tx_wr); } return rc; @@ -143,18 +136,7 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn, peer->token = htonl(local->token); peer->prod_flags.failover_validation = 1; - /* We need to set pend->conn here to make sure smc_cdc_tx_handler() - * can handle properly - */ - smc_cdc_add_pending_send(conn, pend); - - atomic_inc(&conn->cdc_pend_tx_wr); - smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ - rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (unlikely(rc)) - atomic_dec(&conn->cdc_pend_tx_wr); - return rc; } @@ -211,9 +193,31 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) return rc; } -void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn) +static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv 
*tx_pend, + unsigned long data) { - wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr)); + struct smc_connection *conn = (struct smc_connection *)data; + struct smc_cdc_tx_pend *cdc_pend = + (struct smc_cdc_tx_pend *)tx_pend; + + return cdc_pend->conn == conn; +} + +static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) +{ + struct smc_cdc_tx_pend *cdc_pend = + (struct smc_cdc_tx_pend *)tx_pend; + + cdc_pend->conn = NULL; +} + +void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) +{ + struct smc_link *link = conn->lnk; + + smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, + smc_cdc_tx_filter, smc_cdc_tx_dismisser, + (unsigned long)conn); } /* Send a SMC-D CDC header. diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 696cc11f2303..0a0a89abd38b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -291,7 +291,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend); -void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn); +void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3c73a665d110..343afbdafb98 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1152,7 +1152,7 @@ void smc_conn_free(struct smc_connection *conn) smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { - smc_cdc_wait_pend_tx_wr(conn); + smc_cdc_tx_dismiss_slots(conn); if (current_work() != &conn->abort_work) cancel_work_sync(&conn->abort_work); } @@ -1229,7 +1229,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); - smc_ib_modify_qp_error(lnk); + smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); 
smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); @@ -1361,7 +1361,7 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) else tasklet_unlock_wait(&conn->rx_tsklet); } else { - smc_cdc_wait_pend_tx_wr(conn); + smc_cdc_tx_dismiss_slots(conn); } smc_lgr_unregister_conn(conn); smc_close_active_abort(smc); @@ -1484,16 +1484,11 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd) /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. - * - * We must wait here for QPs been destroyed before we destroy the CQs, - * or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus - * smc_sock cannot be released. */ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); - LIST_HEAD(lgr_linkdown_list); int i; spin_lock_bh(&smc_lgr_list.lock); @@ -1505,7 +1500,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) - list_move_tail(&lgr->list, &lgr_linkdown_list); + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } @@ -1517,16 +1512,6 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) __smc_lgr_terminate(lgr, false); } - list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].smcibdev == smcibdev) { - mutex_lock(&lgr->llc_conf_mutex); - smcr_link_down_cond(&lgr->lnk[i]); - mutex_unlock(&lgr->llc_conf_mutex); - } - } - } - if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, @@ -1626,6 +1611,7 @@ static void smcr_link_down(struct smc_link *lnk) if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) return; + smc_ib_modify_qp_reset(lnk); 
to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ smcr_link_clear(lnk, true); @@ -1863,7 +1849,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; - init_waitqueue_head(&conn->cdc_pend_tx_wq); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 65bf38cac7fd..5351fe1a167e 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -109,12 +109,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk) IB_QP_MAX_QP_RD_ATOMIC); } -int smc_ib_modify_qp_error(struct smc_link *lnk) +int smc_ib_modify_qp_reset(struct smc_link *lnk) { struct ib_qp_attr qp_attr; memset(&qp_attr, 0, sizeof(qp_attr)); - qp_attr.qp_state = IB_QPS_ERR; + qp_attr.qp_state = IB_QPS_RESET; return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index bfa1c6bf6313..07585937370e 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -90,7 +90,6 @@ int smc_ib_create_queue_pair(struct smc_link *lnk); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); -int smc_ib_modify_qp_error(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index c6cfdea8b71b..df1dc225cbab 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -62,9 +62,13 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link) } /* wait till all pending tx work requests on the given link are completed */ -void smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +int smc_wr_tx_wait_no_pending_sends(struct 
smc_link *link) { - wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link)); + if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), + SMC_WR_TX_WAIT_PENDING_TIME)) + return 0; + else /* timeout */ + return -EPIPE; } static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) @@ -83,6 +87,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) struct smc_wr_tx_pend pnd_snd; struct smc_link *link; u32 pnd_snd_idx; + int i; link = wc->qp->qp_context; @@ -123,6 +128,14 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (wc->status) { + for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { + /* clear full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[i], 0, + sizeof(link->wr_tx_pends[i])); + memset(&link->wr_tx_bufs[i], 0, + sizeof(link->wr_tx_bufs[i])); + clear_bit(i, link->wr_tx_mask); + } if (link->lgr->smc_version == SMC_V2) { memset(link->wr_tx_v2_pend, 0, sizeof(*link->wr_tx_v2_pend)); @@ -408,6 +421,25 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) return rc; } +void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type, + smc_wr_tx_filter filter, + smc_wr_tx_dismisser dismisser, + unsigned long data) +{ + struct smc_wr_tx_pend_priv *tx_pend; + struct smc_wr_rx_hdr *wr_tx; + int i; + + for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { + wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i]; + if (wr_tx->type != wr_tx_hdr_type) + continue; + tx_pend = &link->wr_tx_pends[i].priv; + if (filter(tx_pend, data)) + dismisser(tx_pend); + } +} + /****************************** receive queue ********************************/ int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) @@ -643,7 +675,10 @@ void smc_wr_free_link(struct smc_link *lnk) smc_wr_wakeup_reg_wait(lnk); smc_wr_wakeup_tx_wait(lnk); - smc_wr_tx_wait_no_pending_sends(lnk); + if (smc_wr_tx_wait_no_pending_sends(lnk)) + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * + 
sizeof(*lnk->wr_tx_mask)); wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 47512ccce5ef..48ed9b08ac7a 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -22,6 +22,7 @@ #define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) +#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ) #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ @@ -129,7 +130,7 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, unsigned long data); -void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); -- Gitee From cb87e95bf98546650480e00984644fc44eeeb796 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:05 +0800 Subject: [PATCH 060/148] Revert "net/smc: don't send CDC/LLC message if link not ready" This reverts commit 5caf09a2f394b3f41158929f1d151263012d29b9. 
--- net/smc/smc_core.c | 2 +- net/smc/smc_core.h | 6 ------ net/smc/smc_llc.c | 2 +- net/smc/smc_wr.c | 4 ++-- net/smc/smc_wr.h | 2 +- 5 files changed, 5 insertions(+), 11 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 343afbdafb98..368b0bc5064c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -647,7 +647,7 @@ static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (smc_link_sendable(lnk)) + if (smc_link_usable(lnk)) lnk->state = SMC_LNK_INACTIVE; } wake_up_all(&lgr->llc_msg_waiter); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f2a12d79793f..93e0e6c647ba 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -416,12 +416,6 @@ static inline bool smc_link_usable(struct smc_link *lnk) return true; } -static inline bool smc_link_sendable(struct smc_link *lnk) -{ - return smc_link_usable(lnk) && - lnk->qp_attr.cur_qp_state == IB_QPS_RTS; -} - static inline bool smc_link_active(struct smc_link *lnk) { return lnk->state == SMC_LNK_ACTIVE; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 1d8dafa1a35e..b74342c8433e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1632,7 +1632,7 @@ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) delllc.reason = htonl(rsn); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_sendable(&lgr->lnk[i])) + if (!smc_link_usable(&lgr->lnk[i])) continue; if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc)) break; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index df1dc225cbab..79a7431f534e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -188,7 +188,7 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) { *idx = link->wr_tx_cnt; - if (!smc_link_sendable(link)) + if (!smc_link_usable(link)) return -ENOLINK; for_each_clear_bit(*idx, 
link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) @@ -231,7 +231,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - !smc_link_sendable(link) || + !smc_link_usable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 48ed9b08ac7a..f353311e6f84 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -62,7 +62,7 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) static inline bool smc_wr_tx_link_hold(struct smc_link *link) { - if (!smc_link_sendable(link)) + if (!smc_link_usable(link)) return false; atomic_inc(&link->wr_tx_refcnt); return true; -- Gitee From 33883cd0c8af8d285406b5ed99c7d88802aa452f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:07 +0800 Subject: [PATCH 061/148] Revert "net/smc: fix using of uninitialized completions" This reverts commit a9f2c5588efe64da4bf6d2eeac1ae1095dd975c5. 
--- net/smc/smc_wr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 79a7431f534e..600ab5889227 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -358,20 +358,18 @@ int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout) { struct smc_wr_tx_pend *pend; - u32 pnd_idx; int rc; pend = container_of(priv, struct smc_wr_tx_pend, priv); pend->compl_requested = 1; - pnd_idx = pend->idx; - init_completion(&link->wr_tx_compl[pnd_idx]); + init_completion(&link->wr_tx_compl[pend->idx]); rc = smc_wr_tx_send(link, priv); if (rc) return rc; /* wait for completion by smc_wr_tx_process_cqe() */ rc = wait_for_completion_interruptible_timeout( - &link->wr_tx_compl[pnd_idx], timeout); + &link->wr_tx_compl[pend->idx], timeout); if (rc <= 0) rc = -ENODATA; if (rc > 0) -- Gitee From 0b5e91d44199a711600ecc79128a735505b33fd1 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:10 +0800 Subject: [PATCH 062/148] Revert "net/smc: Prevent smc_release() from long blocking" This reverts commit c684978752fcd9b7a9866e1627dae60c6f4db780. 
--- net/smc/af_smc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c1583df1cd3f..41eda98d153b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -194,9 +194,7 @@ static int smc_release(struct socket *sock) /* cleanup for a dangling non-blocking connect */ if (smc->connect_nonblock && sk->sk_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); - - if (cancel_work_sync(&smc->connect_work)) - sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ + flush_work(&smc->connect_work); if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires -- Gitee From 748ed8327b88bc88abf6bec17d3d7ceb714e498b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:12 +0800 Subject: [PATCH 063/148] Revert "anolis: Revert "anolis: net/smc: Introduce tunable sysctls for sndbuf and RMB size"" This reverts commit 6ac1ddb82a72ace93a4cf6f38a617395139b2bd7. --- net/smc/Makefile | 2 +- net/smc/af_smc.c | 28 +++++++++++++-- net/smc/smc.h | 5 +++ net/smc/smc_sysctl.c | 81 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 net/smc/smc_sysctl.c diff --git a/net/smc/Makefile b/net/smc/Makefile index 196fb6f01b14..640af9a39f9c 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o +smc-y += smc_tracepoint.o smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 41eda98d153b..e7cb32904ba3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -243,6 +244,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; 
sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + sk->sk_sndbuf = net->smc.sysctl_wmem_default; + sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -2730,8 +2733,6 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, sk_common_release(sk); goto out; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); out: return rc; @@ -2747,6 +2748,13 @@ unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { + if (net != &init_net) { + net->smc.sysctl_wmem_default = + init_net.smc.sysctl_rmem_default; + net->smc.sysctl_rmem_default = + init_net.smc.sysctl_rmem_default; + } + return smc_pnet_net_init(net); } @@ -2780,6 +2788,8 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { int rc; + int max_rshare, max_wshare; + unsigned long limit; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -2853,6 +2863,17 @@ static int __init smc_init(void) goto out_sock; } + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); + max_wshare = min(4UL * 1024 * 1024, limit); + max_rshare = min(6UL * 1024 * 1024, limit); + + init_net.smc.sysctl_wmem_default = 256 * 1024; + init_net.smc.sysctl_rmem_default = 384 * 1024; + +#ifdef CONFIG_SYSCTL + smc_sysctl_init(); +#endif + static_branch_enable(&tcp_have_smc); return 0; @@ -2893,6 +2914,9 @@ static void __exit smc_exit(void) smc_clc_exit(); unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); +#ifdef CONFIG_SYSCTL + smc_sysctl_exit(); +#endif rcu_barrier(); } diff --git a/net/smc/smc.h b/net/smc/smc.h index f4286ca1f228..1505df7d98c4 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -307,4 +307,9 @@ void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device 
*known_dev, u8 *known_gid); +#ifdef CONFIG_SYSCTL +int smc_sysctl_init(void); +void smc_sysctl_exit(void); +#endif + #endif /* __SMC_H */ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 000000000000..317b37095c4f --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "smc_core.h" + +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; + +static struct ctl_table smc_table[] = { + { + .procname = "wmem_default", + .data = &init_net.smc.sysctl_wmem_default, + .maxlen = sizeof(init_net.smc.sysctl_wmem_default), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &init_net.smc.sysctl_rmem_default, + .maxlen = sizeof(init_net.smc.sysctl_rmem_default), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { } +}; + +static __net_init int smc_sysctl_init_net(struct net *net) +{ + struct ctl_table *table; + + table = smc_table; + if (!net_eq(net, &init_net)) { + int i; + + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; + } + + net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc.smc_hdr) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static __net_exit void smc_sysctl_exit_net(struct net *net) +{ + unregister_net_sysctl_table(net->smc.smc_hdr); +} + +static struct pernet_operations smc_sysctl_ops __net_initdata = { + .init = smc_sysctl_init_net, + .exit = smc_sysctl_exit_net, +}; + +int __init smc_sysctl_init(void) +{ + return register_pernet_subsys(&smc_sysctl_ops); +} + +void smc_sysctl_exit(void) +{ + unregister_pernet_subsys(&smc_sysctl_ops); +} -- 
Gitee From 4c27655d025e6f67830481defabdf8b03baf996d Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:14 +0800 Subject: [PATCH 064/148] Revert "anolis: Revert "anolis: net/smc: Expose SMCPROTO_SMC and SMCPROTO_SMC6 to userspace"" This reverts commit 934ae72347d85a854f85c6a5a15efc6800987e3f. --- net/smc/smc.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 1505df7d98c4..f794e3fc4d43 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,10 +21,6 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 - -#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ -#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ - #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ -- Gitee From 60e9b434139b042f6bb1948b48ff1856518b5bc0 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:16 +0800 Subject: [PATCH 065/148] Revert "anolis: Revert "anolis: net/smc: Introduce sysctl tcp2smc"" This reverts commit 25cb1161c680909d6e43c2022965b27d4735c0a2. 
--- net/smc/af_smc.c | 3 +++ net/smc/smc_sysctl.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e7cb32904ba3..ed551eb721c7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2753,6 +2753,7 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_rmem_default = init_net.smc.sysctl_rmem_default; + net->smc.sysctl_tcp2smc = 0; } return smc_pnet_net_init(net); @@ -2760,6 +2761,7 @@ static __net_init int smc_net_init(struct net *net) static void __net_exit smc_net_exit(struct net *net) { + net->smc.sysctl_tcp2smc = 0; smc_pnet_net_exit(net); } @@ -2869,6 +2871,7 @@ static int __init smc_init(void) init_net.smc.sysctl_wmem_default = 256 * 1024; init_net.smc.sysctl_rmem_default = 384 * 1024; + init_net.smc.sysctl_tcp2smc = 0; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 317b37095c4f..e3942837c3e3 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -28,6 +28,13 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, + { + .procname = "tcp2smc", + .data = &init_net.smc.sysctl_tcp2smc, + .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; -- Gitee From 45314fd95becb684b9f5a43cf53e51984f5cb574 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:18 +0800 Subject: [PATCH 066/148] Revert "anolis: Revert "anolis: net/smc: Introduce SMC-R-related proc files"" This reverts commit d6c29ecb7092f2ad6884f11c9804581247de4f8c. 
--- net/smc/Makefile | 2 +- net/smc/af_smc.c | 27 ++++- net/smc/smc_diag.c | 29 ++--- net/smc/smc_proc.c | 287 +++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_proc.h | 34 ++++++ 5 files changed, 358 insertions(+), 21 deletions(-) create mode 100644 net/smc/smc_proc.c create mode 100644 net/smc/smc_proc.h diff --git a/net/smc/Makefile b/net/smc/Makefile index 640af9a39f9c..19076ff20d58 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o +smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ed551eb721c7..4d1493a5aa92 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -52,6 +52,7 @@ #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" +#include "smc_proc.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -86,11 +87,13 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - head = &h->ht; - write_lock_bh(&h->lock); + + head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); return 0; @@ -2789,7 +2792,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc; + int rc, i; int max_rshare, max_wshare; unsigned long limit; @@ -2856,13 +2859,22 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); + + for (i = 0; i < SMC_HTABLE_SIZE; i++) { + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); + } 
+ + rc = smc_proc_init(); + if (rc) { + pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); + goto out_sock; + } rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_sock; + goto out_proc; } limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); @@ -2880,6 +2892,8 @@ static int __init smc_init(void) static_branch_enable(&tcp_have_smc); return 0; +out_proc: + smc_proc_exit(); out_sock: sock_unregister(PF_SMC); out_proto6: @@ -2905,6 +2919,7 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index c952986a6aca..40036e9926e0 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -196,24 +196,25 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; - int rc = 0, num = 0; + int rc = 0, num = 0, slot; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - head = &prot->h.smc_hash->ht; - if (hlist_empty(head)) - goto out; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + + for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { + head = &prot->h.smc_hash->ht[slot]; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; + num++; + } } out: diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c new file mode 100644 index 000000000000..19d8cc82a7ac --- /dev/null +++ b/net/smc/smc_proc.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include "smc.h" +#include "smc_proc.h" 
+#include "smc_core.h" + +static void *smc_get_next(struct seq_file *seq, void *cur) +{ + struct smc_proc_private *sp = seq->private; + struct smc_hashinfo *smc_hash = + sp->protocol == SMCPROTO_SMC ? + smc_proto.h.smc_hash : smc_proto6.h.smc_hash; + struct net *net = seq_file_net(seq); + struct hlist_head *head; + struct sock *sk = cur; + + if (!sk) { + read_lock(&smc_hash->lock); +get_head: + head = &smc_hash->ht[sp->bucket]; + sk = sk_head(head); + sp->offset = 0; + goto get_sk; + } + ++sp->num; + ++sp->offset; + + sk = sk_next(sk); +get_sk: + sk_for_each_from(sk) { + if (!net_eq(sock_net(sk), net)) + continue; + return sk; + } + sp->offset = 0; + if (++sp->bucket < SMC_HTABLE_SIZE) + goto get_head; + + read_unlock(&smc_hash->lock); + return NULL; +} + +static void *smc_seek_last_pos(struct seq_file *seq) +{ + struct smc_proc_private *sp = seq->private; + int offset = sp->offset; + int orig_num = sp->num; + void *rc = NULL; + + if (sp->bucket >= SMC_HTABLE_SIZE) + goto out; + + rc = smc_get_next(seq, NULL); + while (offset-- && rc) + rc = smc_get_next(seq, rc); + + if (rc) + goto out; + + sp->bucket = 0; +out: + sp->num = orig_num; + return rc; +} + +static void *smc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct smc_proc_private *sp = seq->private; + void *rc; + + sp->bucket = 0; + rc = smc_get_next(seq, NULL); + + while (rc && pos) { + rc = smc_get_next(seq, rc); + --pos; + } + return rc; +} + +static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) +{ + struct smc_proc_private *sp = seq->private; + void *rc; + + if (*pos && *pos == sp->last_pos) { + rc = smc_seek_last_pos(seq); + if (rc) + goto out; + } + + sp->num = 0; + sp->bucket = 0; + sp->offset = 0; + sp->protocol = protocol; + rc = *pos ? 
smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + +out: + sp->last_pos = *pos; + return rc; +} + +static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) +{ + return _smc_conn_start(seq, pos, SMCPROTO_SMC); +} + +static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) +{ + return _smc_conn_start(seq, pos, SMCPROTO_SMC6); +} + +static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) +{ + struct smc_proc_private *sp = seq->private; + const struct in6_addr *dest, *src; + struct smc_link_group *lgr; + struct socket *clcsock; + struct smc_link *lnk; + struct sock *sk; + bool fb = false; + int i; + + fb = smc->use_fallback; + clcsock = smc->clcsock; + sk = &smc->sk; + + if (protocol == SMCPROTO_SMC) + seq_printf(seq, CONN4_ADDR_FM, sp->num, + clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, + clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); + else if (protocol == SMCPROTO_SMC6) { + dest = &clcsock->sk->sk_v6_daddr; + src = &clcsock->sk->sk_v6_rcv_saddr; + seq_printf(seq, CONN6_ADDR_FM, sp->num, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); + } + + seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, + sk, clcsock->sk, fb ? clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); + + lgr = smc->conn.lgr; + lnk = smc->conn.lnk; + + if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { + for (i = 0; i < SMC_LGR_ID_SIZE; i++) + seq_printf(seq, "%02X", lgr->id[i]); + + seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', + lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, + lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); + } else { + seq_puts(seq, "- - - - - - - -\n"); + } +} + +static int smc_conn_show(struct seq_file *seq, void *v) +{ + struct smc_proc_private *sp = seq->private; + struct socket *clcsock; + struct smc_sock *smc; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, + "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", + "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", + "l_qp", "r_qp", "tx_cnt", "rx_cnt"); + goto out; + } + + smc = smc_sk(v); + clcsock = smc->clcsock; + if (!clcsock) + goto out; + + _conn_show(seq, smc, sp->protocol); +out: + return 0; +} + +static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct smc_proc_private *sp = seq->private; + void *rc = NULL; + + if (v == SEQ_START_TOKEN) { + rc = smc_get_idx(seq, 0); + goto out; + } + rc = smc_get_next(seq, v); +out: + ++*pos; + sp->last_pos = *pos; + return rc; +} + +static void smc_conn_stop(struct seq_file *seq, void *v) +{ + struct smc_proc_private *sp = seq->private; + struct smc_hashinfo *smc_hash = + sp->protocol == SMCPROTO_SMC ? 
+ smc_proto.h.smc_hash : smc_proto6.h.smc_hash; + + if (v && v != SEQ_START_TOKEN) + read_unlock(&smc_hash->lock); +} + +static struct smc_proc_entry smc_proc[] = { + { + .name = "smc4", + .ops = { + .show = smc_conn_show, + .start = smc_conn4_start, + .next = smc_conn_next, + .stop = smc_conn_stop, + }, + }, +#if IS_ENABLED(CONFIG_IPV6) + { + .name = "smc6", + .ops = { + .show = smc_conn_show, + .start = smc_conn6_start, + .next = smc_conn_next, + .stop = smc_conn_stop, + }, + }, +#endif +}; + +static int __net_init smc_proc_dir_init(struct net *net) +{ + int i, rc = -ENOMEM; + + net->proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); + if (!net->proc_net_smc) + goto err; + + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { + if (!proc_create_net_data(smc_proc[i].name, 0444, + net->proc_net_smc, &smc_proc[i].ops, + sizeof(struct smc_proc_private), + NULL)) + goto err_entry; + } + + return 0; + +err_entry: + for (i -= 1; i >= 0; i--) + remove_proc_entry(smc_proc[i].name, net->proc_net_smc); + + remove_proc_entry("smc", net->proc_net); +err: + return rc; +} + +static void __net_exit smc_proc_dir_exit(struct net *net) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) + remove_proc_entry(smc_proc[i].name, net->proc_net_smc); + + remove_proc_entry("smc", net->proc_net); +} + +static struct pernet_operations smc_proc_ops = { + .init = smc_proc_dir_init, + .exit = smc_proc_dir_exit, +}; + +int __init smc_proc_init(void) +{ + return register_pernet_subsys(&smc_proc_ops); +} + +void smc_proc_exit(void) +{ + unregister_pernet_subsys(&smc_proc_ops); +} diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h new file mode 100644 index 000000000000..ec59ca03e163 --- /dev/null +++ b/net/smc/smc_proc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _SMC_PROC_H_ +#define _SMC_PROC_H_ + +#include +#include +#include +#include +#include +#include "smc.h" + +#define CONN4_HDR 
("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") +#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") +#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") + +struct smc_proc_private { + struct seq_net_private p; + int num, bucket, offset; + int protocol; + loff_t last_pos; +}; + +struct smc_proc_entry { + const char *name; + const struct seq_operations ops; +}; + +int __init smc_proc_init(void); +void smc_proc_exit(void); + +#endif -- Gitee From 286b5e918718ff9e5ceac05bfa8550574a11bb54 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:20 +0800 Subject: [PATCH 067/148] Revert "anolis: Revert "anolis: net/smc: Introduce TCP to SMC replacement netlink commands"" This reverts commit b265f780b441a07c25c5238acd8ddabfdd48fcc0. 
--- net/smc/Makefile | 2 +- net/smc/af_smc.c | 12 ++- net/smc/smc_conv.c | 186 ++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_conv.h | 22 +++++ net/smc/smc_netlink.c | 19 ++++- net/smc/smc_netlink.h | 5 ++ 6 files changed, 243 insertions(+), 3 deletions(-) create mode 100644 net/smc/smc_conv.c create mode 100644 net/smc/smc_conv.h diff --git a/net/smc/Makefile b/net/smc/Makefile index 19076ff20d58..72b3c934e473 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o +smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o smc_conv.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4d1493a5aa92..1221dca53654 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,6 +53,7 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_proc.h" +#include "smc_conv.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -2871,10 +2872,16 @@ static int __init smc_init(void) goto out_sock; } + rc = smc_conv_init(); + if (rc) { + pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); + goto out_proc; + } + rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_proc; + goto out_conv; } limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); @@ -2892,6 +2899,8 @@ static int __init smc_init(void) static_branch_enable(&tcp_have_smc); return 0; +out_conv: + smc_conv_exit(); out_proc: smc_proc_exit(); out_sock: @@ -2919,6 +2928,7 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_conv_exit(); smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_conv.c b/net/smc/smc_conv.c new file mode 
100644 index 000000000000..e1f87d1de8a5 --- /dev/null +++ b/net/smc/smc_conv.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include "smc_netlink.h" +#include "smc_conv.h" + +int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *wlist_elem, *tmp; + char msg[TASK_COMM_LEN]; + struct nlattr *na; + + na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; + if (!na) + return -EINVAL; + + nla_strlcpy(msg, na, TASK_COMM_LEN); + + mutex_lock(wlist_lock); + if (*wlist_len >= SMC_MAX_WLIST_LEN) { + mutex_unlock(wlist_lock); + return -EINVAL; + } + + list_for_each_entry(tmp, wlist, list) { + if (!strcmp(tmp->task_comm, msg)) + goto out; + } + + wlist_elem = kmalloc(sizeof(*wlist_elem), GFP_KERNEL); + if (!wlist_elem) { + mutex_unlock(wlist_lock); + return -ENOMEM; + } + + strcpy(wlist_elem->task_comm, msg); + list_add_tail_rcu(&wlist_elem->list, wlist); + ++*wlist_len; +out: + mutex_unlock(wlist_lock); + return 0; +} + +int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *tmp, *nxt; + char msg[TASK_COMM_LEN]; + struct nlattr *na; + + na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; + if (!na) + return -EINVAL; + + nla_strlcpy(msg, na, TASK_COMM_LEN); + + mutex_lock(wlist_lock); + list_for_each_entry_safe(tmp, nxt, wlist, list) { + if (!strcmp(tmp->task_comm, msg)) { + list_del_rcu(&tmp->list); + synchronize_rcu(); + kfree(tmp); + --*wlist_len; + break; + } + } + mutex_unlock(wlist_lock); + return 0; +} + +int 
smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct list_head *wlist = &net->smc.smc_conv.wlist; + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_conv_wlist_elem *tmp; + void *nlh; + + if (cb_ctx->pos[0]) + goto errmsg; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_TCP2SMC_WLIST); + if (!nlh) + goto errmsg; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, wlist, list) { + if (nla_put(skb, SMC_CMD_ATTR_TCP2SMC, + nla_total_size(strlen(tmp->task_comm) + 1), + tmp->task_comm)) { + rcu_read_unlock(); + goto errattr; + } + } + rcu_read_unlock(); + + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + return skb->len; + +errattr: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_match_tcp2smc_wlist(struct net *net, char *comm) +{ + struct list_head *wlist = &net->smc.smc_conv.wlist; + struct smc_conv_wlist_elem *tmp; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, wlist, list) { + if (!strcmp(tmp->task_comm, comm)) { + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + return -1; +} + +static int __net_init smc_net_conv_init(struct net *net) +{ + INIT_LIST_HEAD_RCU(&net->smc.smc_conv.wlist); + net->smc.smc_conv.wlist_len = 0; + + mutex_init(&net->smc.smc_conv.wlist_lock); + + rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, + smc_match_tcp2smc_wlist); + return 0; +} + +static void __net_exit smc_net_conv_exit(struct net *net) +{ + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *cur, *nxt; + struct list_head tmp_list; + + rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, NULL); + synchronize_rcu(); + + INIT_LIST_HEAD(&tmp_list); + + mutex_lock(wlist_lock); + list_splice_init_rcu(wlist, &tmp_list, 
synchronize_rcu); + *wlist_len = 0; + mutex_unlock(wlist_lock); + + list_for_each_entry_safe(cur, nxt, &tmp_list, list) { + list_del(&cur->list); + kfree(cur); + } +} + +static struct pernet_operations smc_conv_ops = { + .init = smc_net_conv_init, + .exit = smc_net_conv_exit, +}; + +int __init smc_conv_init(void) +{ + return register_pernet_subsys(&smc_conv_ops); +} + +void smc_conv_exit(void) +{ + unregister_pernet_subsys(&smc_conv_ops); +} diff --git a/net/smc/smc_conv.h b/net/smc/smc_conv.h new file mode 100644 index 000000000000..1615b27feede --- /dev/null +++ b/net/smc/smc_conv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef NET_SMC_SMC_CONV_H_ +#define NET_SMC_SMC_CONV_H_ +#include +#include +#include + +#define SMC_MAX_WLIST_LEN 32 + +struct smc_conv_wlist_elem { + char task_comm[TASK_COMM_LEN]; + struct list_head list; +}; + +int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); +int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); +int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb); +int __init smc_conv_init(void); +void smc_conv_exit(void); + +#endif /* NET_SMC_SMC_CONV_H_ */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index f13ab0661ed5..f2007aa124cf 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -22,6 +22,7 @@ #include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" +#include "smc_conv.h" const struct nla_policy smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { @@ -111,9 +112,25 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_seid, }, + { + .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .doit = smc_nl_add_tcp2smc_wlist, + }, + { + .cmd = SMC_NETLINK_DEL_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .doit = smc_nl_del_tcp2smc_wlist, + }, + { + .cmd = SMC_NETLINK_GET_TCP2SMC_WLIST, + /* can be retrieved by 
unprivileged users */ + .dumpit = smc_nl_get_tcp2smc_wlist, + }, }; -static const struct nla_policy smc_gen_nl_policy[2] = { +static const struct nla_policy smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { + [SMC_CMD_ATTR_TCP2SMC] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN - 1 }, [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, }; diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index e8c6c3f0e98c..aae13737095e 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -15,6 +15,11 @@ #include #include +enum { + SMC_CMD_ATTR_TCP2SMC = 1, + SMC_CMD_MAX_ATTR, +}; + extern struct genl_family smc_gen_nl_family; extern const struct nla_policy smc_gen_ueid_policy[]; -- Gitee From 0a1e763d0cb27538d4047b6a495d3cdb5af130cf Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:23 +0800 Subject: [PATCH 068/148] Revert "anolis: Revert "anolis: net/smc: Add TX and RX diagnosis information"" This reverts commit 96efc0d3a16d0c622e3c9cf75c3447ad8196d8d9. --- net/smc/smc.h | 6 ++++++ net/smc/smc_core.c | 15 +++++++++++++++ net/smc/smc_diag.c | 6 ++++++ net/smc/smc_rx.c | 2 ++ net/smc/smc_tx.c | 9 +++++++-- 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index f794e3fc4d43..f158e552f43c 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -210,6 +210,12 @@ struct smc_connection { u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ + u64 rx_cnt; /* rx counter */ + u64 tx_cnt; /* tx counter */ + u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ + u64 rx_bytes; /* rx size */ + u64 tx_bytes; /* tx size */ + u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 out_of_sync : 1; /* out of sync with peer */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 368b0bc5064c..e93458e3e7a4 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1774,6 +1774,20 @@ static bool smcd_lgr_match(struct 
smc_link_group *lgr, return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } +static void smc_rx_tx_counter_init(struct smc_connection *conn) +{ + /* Initialize RX & TX diagnostic inform for each + * connection. These counters mean what smc wants + * net devices "TODO" insead of what has been "DONE" + */ + conn->rx_cnt = 0; + conn->tx_cnt = 0; + conn->tx_corked_cnt = 0; + conn->rx_bytes = 0; + conn->tx_bytes = 0; + conn->tx_corked_bytes = 0; +} + /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -1849,6 +1863,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + smc_rx_tx_counter_init(conn); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 40036e9926e0..1fa7c7cf9332 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,6 +136,12 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, + .rx_cnt = conn->rx_cnt, + .tx_cnt = conn->tx_cnt, + .tx_corked_cnt = conn->tx_corked_cnt, + .rx_bytes = conn->rx_bytes, + .tx_bytes = conn->tx_bytes, + .tx_corked_bytes = conn->tx_corked_bytes, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 51e8eb2933ff..bf353c68323d 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -392,6 +392,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable--; /* always stop at urgent Byte */ /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); + conn->rx_bytes += copylen; /* determine chunks where to 
read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - @@ -441,6 +442,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } trace_smc_rx_recvmsg(smc, copylen); + ++conn->rx_cnt; } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 02d147bde78c..82735741bc2a 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -239,14 +239,19 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) conn->urg_tx_pend = true; if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_desc->len >> 1))) + (conn->sndbuf_desc->len >> 1))) { /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ + conn->tx_corked_bytes += copylen; + ++conn->tx_corked_cnt; queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_CORK_DELAY); - else + } else { + conn->tx_bytes += copylen; + ++conn->tx_cnt; smc_tx_sndbuf_nonempty(conn); + } trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From 06f83fc821e072b779b2bbd39b6ce9e62784932b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:25 +0800 Subject: [PATCH 069/148] Revert "anolis: Revert "anolis: net/smc: Add SMC-R link-down counters"" This reverts commit 9a64681d63797dc97ef05464ee474a58ebd1a1c5. 
--- net/smc/smc_core.c | 14 +++++++++++--- net/smc/smc_core.h | 2 ++ net/smc/smc_diag.c | 2 ++ net/smc/smc_llc.c | 3 +++ net/smc/smc_tx.c | 4 +++- net/smc/smc_wr.c | 5 +++++ 6 files changed, 26 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e93458e3e7a4..e93d3ce74951 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -767,6 +767,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; lnk->link_idx = link_idx; + lnk->link_down_cnt_smc = 0; + lnk->link_down_cnt_ib = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); @@ -1071,16 +1073,20 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); - if (rc) + if (rc) { + ++to_lnk->link_down_cnt_smc; goto err_out; + } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); - if (rc) + if (rc) { + ++to_lnk->link_down_cnt_ib; goto err_out; + } goto again; } read_unlock_bh(&lgr->conns_lock); @@ -1676,8 +1682,10 @@ void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk) && - lnk->smcibdev == smcibdev && lnk->ibport == ibport) + lnk->smcibdev == smcibdev && lnk->ibport == ibport) { + ++lnk->link_down_cnt_ib; smcr_link_down_cond_sched(lnk); + } } } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 93e0e6c647ba..28d0bcb5759f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -148,6 +148,8 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this 
link */ + u64 link_down_cnt_smc; /* smc-caused link down counter */ + u64 link_down_cnt_ib; /* ib-caused link down counter */ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 1fa7c7cf9332..ddecd39aa4a4 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -155,6 +155,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .role = smc->conn.lgr->role, .lnk[0].ibport = smc->conn.lnk->ibport, .lnk[0].link_id = smc->conn.lnk->link_id, + .lnk[0].link_down_cnt_smc = smc->conn.lnk->link_down_cnt_smc, + .lnk[0].link_down_cnt_ib = smc->conn.lnk->link_down_cnt_ib, }; memcpy(linfo.lnk[0].ibname, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index b74342c8433e..b8587cb50300 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1281,12 +1281,14 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, true, SMC_LLC_DEL_NO_ASYM_NEEDED); if (rc) { + ++lnk_new->link_down_cnt_ib; smcr_link_down_cond(lnk_new); goto out_free; } qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, SMC_LLC_DELETE_LINK); if (!qentry) { + ++lnk_new->link_down_cnt_ib; smcr_link_down_cond(lnk_new); goto out_free; } @@ -2100,6 +2102,7 @@ static void smc_llc_testlink_work(struct work_struct *work) if (!smc_link_active(link)) return; /* link state changed */ if (rc <= 0) { + ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); return; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 82735741bc2a..efe0b393a5fe 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -304,8 +304,10 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) + if (rc) { + ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); + } return rc; } 
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 600ab5889227..fe7d4e05722e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -142,6 +142,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) memset(link->lgr->wr_tx_buf_v2, 0, sizeof(*link->lgr->wr_tx_buf_v2)); } + ++link->link_down_cnt_ib; /* terminate link */ smcr_link_down_cond_sched(link); } @@ -236,6 +237,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { + ++link->link_down_cnt_smc; /* timeout - terminate link */ smcr_link_down_cond_sched(link); return -EPIPE; @@ -345,6 +347,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); + ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); } return rc; @@ -399,6 +402,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (atomic_dec_and_test(&link->wr_reg_refcnt)) wake_up_all(&link->wr_reg_wait); if (!rc) { + ++link->link_down_cnt_ib; /* timeout - terminate link */ smcr_link_down_cond_sched(link); return -EPIPE; @@ -498,6 +502,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: + ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); break; default: -- Gitee From 422ec054cf050ad1fc9a3d0e0b73751aea61c4c0 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:27 +0800 Subject: [PATCH 070/148] Revert "anolis: Revert "anolis: net/smc: don't call ib_req_notify_cq in the send routine"" This reverts commit fa3fdc3e1387ce02596c51f054d214568f44a7f5. 
--- net/smc/smc_ib.c | 6 ++++++ net/smc/smc_wr.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 5351fe1a167e..331280e00a5a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -135,6 +135,12 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; + + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index fe7d4e05722e..2aebdf49020f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -325,8 +325,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From aff423d9d939196287b97d3668f5043bcd51dfbc Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:29 +0800 Subject: [PATCH 071/148] Revert "anolis: Revert "anolis: net/smc: support auto-cork with nagle algorithm"" This reverts commit c5f92e224ac646263206f38d5be12adadb461546. 
--- net/smc/af_smc.c | 24 ++----------- net/smc/smc.h | 2 ++ net/smc/smc_cdc.c | 14 +++++++- net/smc/smc_cdc.h | 2 ++ net/smc/smc_sysctl.c | 9 +++++ net/smc/smc_tx.c | 86 +++++++++++++++++++++++++++++++++++++------- 6 files changed, 101 insertions(+), 36 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1221dca53654..835731e80413 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2476,28 +2476,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, rc = -EINVAL; } break; - case TCP_NODELAY: - if (sk->sk_state != SMC_INIT && - sk->sk_state != SMC_LISTEN && - sk->sk_state != SMC_CLOSED) { - if (val) { - SMC_STAT_INC(smc, ndly_cnt); - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); - } - } - break; - case TCP_CORK: - if (sk->sk_state != SMC_INIT && - sk->sk_state != SMC_LISTEN && - sk->sk_state != SMC_CLOSED) { - if (!val) { - SMC_STAT_INC(smc, cork_cnt); - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); - } - } - break; case TCP_DEFER_ACCEPT: smc->sockopt_defer_accept = val; break; @@ -2758,6 +2736,7 @@ static __net_init int smc_net_init(struct net *net) net->smc.sysctl_rmem_default = init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; + net->smc.sysctl_autocorking = 1; } return smc_pnet_net_init(net); @@ -2891,6 +2870,7 @@ static int __init smc_init(void) init_net.smc.sysctl_wmem_default = 256 * 1024; init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; + init_net.smc.sysctl_autocorking = 1; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc.h b/net/smc/smc.h index f158e552f43c..cf9f92b4c1b9 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -178,6 +178,8 @@ struct smc_connection { spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ + atomic_t cdc_pend_tx_wr; /* pending tx CDC wr */ + atomic_t tx_pushing; /* num of user trying tx push */ struct 
smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 99acd337ba90..e4e3cebff922 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -51,6 +51,13 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn); conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } + /* If this is the last pending WR complete, push them to prevent + * no one trying to push when corked. + */ + if (likely(!cdcpend->validation) && + atomic_dec_and_test(&conn->cdc_pend_tx_wr)) + smc_tx_sndbuf_nonempty(conn); + smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); } @@ -106,14 +113,18 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + atomic_inc(&conn->cdc_pend_tx_wr); + smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ + smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (!rc) { + if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + atomic_dec(&conn->cdc_pend_tx_wr); } return rc; @@ -135,6 +146,7 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn, peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */ peer->token = htonl(local->token); peer->prod_flags.failover_validation = 1; + pend->validation = 1; rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); return rc; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 0a0a89abd38b..b790f2af8607 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -284,6 +284,8 @@ struct smc_cdc_tx_pend { union smc_host_cursor cursor; /* tx sndbuf cursor sent */ union smc_host_cursor p_cursor; /* rx RMBE cursor produced */ u16 ctrl_seq; /* conn. 
tx sequence # */ + u16 validation:1; + u16 reserved:15; }; int smc_cdc_get_free_slot(struct smc_connection *conn, diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index e3942837c3e3..a9fca59512bd 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -35,6 +35,15 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "autocorking", + .data = &init_net.smc.sysctl_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index efe0b393a5fe..8f1d8cd13191 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -31,7 +31,6 @@ #include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 -#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -125,11 +124,37 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) return rc; } -static bool smc_tx_is_corked(struct smc_sock *smc) +/* Strategy: Nagle algorithm + * 1. The first message should never cork + * 2. If we have any inflight messages, wait for the first + * message back + * 3. The total corked message should not exceed min(64k, sendbuf/2) + */ +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) { - struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); - - return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; + struct smc_connection *conn = &smc->conn; + int prepared_send; + + /* First request && no more message should always pass */ + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 && + !(msg->msg_flags & MSG_MORE)) + return false; + + /* If We have enough data in the send queue that have not been + * pushed, send immediately. + * Note, here we only care about the prepared_sends, but not + * sendbuf_space because sendbuf_space has nothing to do with + * corked data size. 
+ */ + prepared_send = smc_tx_prepared_sends(conn); + if (prepared_send > min(64 * 1024, conn->sndbuf_desc->len >> 1)) + return false; + + if (!sock_net(&smc->sk)->smc.sysctl_autocorking) + return false; + + /* All the other conditions should cork */ + return true; } /* sndbuf producer: main API called by socket layer. @@ -178,6 +203,13 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; + /* If our send queue is full but peer have RMBE space, + * we should send them out before wait + */ + if (!atomic_read(&conn->sndbuf_space) && + atomic_read(&conn->peer_rmbe_space) > 0) + smc_tx_sndbuf_nonempty(conn); + if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { rc = smc_tx_wait(smc, msg->msg_flags); if (rc) { @@ -237,19 +269,17 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && - (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_desc->len >> 1))) { + if (smc_tx_should_cork(smc, msg)) { /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ conn->tx_corked_bytes += copylen; ++conn->tx_corked_cnt; - queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, - SMC_TX_CORK_DELAY); } else { conn->tx_bytes += copylen; ++conn->tx_cnt; + if (delayed_work_pending(&conn->tx_work)) + cancel_delayed_work(&conn->tx_work); smc_tx_sndbuf_nonempty(conn); } @@ -586,11 +616,31 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc; + int rc = 0; + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + /* Only let one to push to prevent wasting of CPU and CDC slot */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + + 
/* No data in the send queue */ + if (unlikely(smc_tx_prepared_sends(conn) <= 0)) + goto out; + + /* Peer don't have RMBE space */ + if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); + goto out; + } if (conn->killed || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) - return -EPIPE; /* connection being aborted */ + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -EPIPE; /* connection being aborted */ + goto out; + } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -602,6 +652,16 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) conn); smc_close_wake_tx_prepared(smc); } + +out: + /* We need to check whether someone else have added some data into + * the send queue and tried to push but failed when we are pushing. + * If so, we need to try push again to prevent those data in the + * send queue may never been pushed out + */ + if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) + goto again; + return rc; } -- Gitee From 47cca3e06200cc7c43c790adaa8f9eebada0fb41 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:31 +0800 Subject: [PATCH 072/148] Revert "anolis: Revert "anolis: net/smc: allow different subnet communication"" This reverts commit 307bd9d02d4d3e8ce7dbc50d1506f78ec23ef985. 
--- net/smc/af_smc.c | 14 ++++++++++---- net/smc/smc_sysctl.c | 9 +++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 835731e80413..155e8a2be212 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1870,6 +1870,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { + struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -1877,10 +1878,12 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); + if (!net->smc.sysctl_allow_different_subnet) { + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); + } /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -2737,6 +2740,7 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_autocorking = 1; + net->smc.sysctl_allow_different_subnet = 0; } return smc_pnet_net_init(net); @@ -2745,6 +2749,7 @@ static __net_init int smc_net_init(struct net *net) static void __net_exit smc_net_exit(struct net *net) { net->smc.sysctl_tcp2smc = 0; + net->smc.sysctl_allow_different_subnet = 0; smc_pnet_net_exit(net); } @@ -2871,6 +2876,7 @@ static int __init smc_init(void) init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; init_net.smc.sysctl_autocorking = 1; + init_net.smc.sysctl_allow_different_subnet = 0; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a9fca59512bd..ad4c1c8a51cc 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -44,6 
+44,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "allow_different_subnet", + .data = &init_net.smc.sysctl_allow_different_subnet, + .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; -- Gitee From 04c5af30b41be7caa5f0914643b7acd70d966fe1 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:33 +0800 Subject: [PATCH 073/148] Revert "anolis: Revert "anolis: Revert "net/smc: Avoid warning of possible recursive locking""" This reverts commit 29cb32fe12d57f9396ca37538c320d56c5b004c5. --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 155e8a2be212..99e0023ca6d6 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -592,7 +592,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * to clcsocket->wq during the fallback. */ spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); + spin_lock(&clc_wait->lock); list_splice_init(&smc_wait->head, &clc_wait->head); spin_unlock(&clc_wait->lock); spin_unlock_irqrestore(&smc_wait->lock, flags); -- Gitee From 228e0e5cce99325a69722c859d493c28ad9c9cf5 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:35 +0800 Subject: [PATCH 074/148] Revert "anolis: Revert "anolis: Revert "net/smc: Transfer remaining wait queue entries during fallback""" This reverts commit 200a072892faf3b06e18b09677799e8aface6063. 
--- net/smc/af_smc.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 99e0023ca6d6..5be0224705c4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -573,10 +573,6 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); - unsigned long flags; - smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -586,16 +582,6 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - - /* There may be some entries remaining in - * smc socket->wq, which should be removed - * to clcsocket->wq during the fallback. - */ - spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock(&clc_wait->lock); - list_splice_init(&smc_wait->head, &clc_wait->head); - spin_unlock(&clc_wait->lock); - spin_unlock_irqrestore(&smc_wait->lock, flags); } } -- Gitee From d79ce506563248d15c20766a7a488ac35ba02e8f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:37 +0800 Subject: [PATCH 075/148] Revert "anolis: Revert "anolis: net/smc: Forward wake-up to smc socket wait queue when fallback"" This reverts commit e145f3de04743ca195d9b53aa2ebf829e81179fc. 
--- net/smc/af_smc.c | 134 +++++++++++++++++++++++++++++++++++++++++++---- net/smc/smc.h | 11 ++++ 2 files changed, 135 insertions(+), 10 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5be0224705c4..cc7f0ccdf383 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -67,6 +67,10 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); +static void smc_clcsock_state_change(struct sock *clcsk); +static void smc_clcsock_data_ready(struct sock *clcsk); +static void smc_clcsock_write_space(struct sock *clcsk); +static void smc_clcsock_error_report(struct sock *clcsk); static void smc_set_keepalive(struct sock *sk, int val) { @@ -573,6 +577,8 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { + struct sock *clcsk = smc->clcsock->sk; + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -582,6 +588,19 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + + smc->clcsk_state_change = clcsk->sk_state_change; + smc->clcsk_data_ready = clcsk->sk_data_ready; + smc->clcsk_write_space = clcsk->sk_write_space; + smc->clcsk_error_report = clcsk->sk_error_report; + + clcsk->sk_state_change = smc_clcsock_state_change; + clcsk->sk_data_ready = smc_clcsock_data_ready; + clcsk->sk_write_space = smc_clcsock_write_space; + clcsk->sk_error_report = smc_clcsock_error_report; + + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } } @@ -2080,22 +2099,117 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } -static void smc_clcsock_data_ready(struct sock *listen_clcsock) +static void 
smc_wake_up_waitqueue(struct smc_sock *smc, void *key) { - struct smc_sock *lsmc; + struct socket_wq *wq; + __poll_t flags; + + rcu_read_lock(); + wq = rcu_dereference(smc->sk.sk_wq); + if (skwq_has_sleeper(wq)) { + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } + } + rcu_read_unlock(); +} + +static int smc_mark_clcwq_woken(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct smc_wait_private *priv = wait->private; + + priv->woken = true; + priv->key = key; + return 0; +} + +static void smc_forward_wake_up_waitqueue(struct smc_sock *smc, struct sock *clcsk, + void (*clcsk_callback)(struct sock *sk)) +{ + struct smc_wait_private wait_priv; + struct wait_queue_entry wait; - lsmc = (struct smc_sock *) - ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!lsmc) + init_waitqueue_func_entry(&wait, smc_mark_clcwq_woken); + wait_priv.woken = false; + wait.private = &wait_priv; + + add_wait_queue(sk_sleep(clcsk), &wait); + clcsk_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &wait); + + if (wait_priv.woken) + smc_wake_up_waitqueue(smc, wait_priv.key); +} + +static void smc_clcsock_state_change(struct sock *clcsk) +{ + struct smc_sock *smc; + + smc = (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!smc) return; - lsmc->clcsk_data_ready(listen_clcsock); - if (lsmc->sk.sk_state == SMC_LISTEN) { - sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) - sock_put(&lsmc->sk); + + smc_forward_wake_up_waitqueue(smc, clcsk, smc->clcsk_state_change); +} + +static void smc_clcsock_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc; 
+ + smc = (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!smc) + return; + + if (!smc->use_fallback) { + /* listening situation */ + smc->clcsk_data_ready(clcsk); + if (smc->sk.sk_state == SMC_LISTEN) { + sock_hold(&smc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_hs_wq, &smc->tcp_listen_work)) + sock_put(&smc->sk); + } + } else { + /* fallback situation */ + smc_forward_wake_up_waitqueue(smc, clcsk, smc->clcsk_data_ready); } } +static void smc_clcsock_write_space(struct sock *clcsk) +{ + struct smc_sock *smc; + + smc = (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!smc) + return; + + smc_forward_wake_up_waitqueue(smc, clcsk, smc->clcsk_write_space); +} + +static void smc_clcsock_error_report(struct sock *clcsk) +{ + struct smc_sock *smc; + + smc = (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!smc) + return; + + smc_forward_wake_up_waitqueue(smc, clcsk, smc->clcsk_error_report); +} + static int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; diff --git a/net/smc/smc.h b/net/smc/smc.h index cf9f92b4c1b9..3d0ca28e7395 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -135,6 +135,11 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; +struct smc_wait_private { + bool woken; + void *key; +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -226,8 +231,14 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); /* original data_ready fct. **/ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. 
*/ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ -- Gitee From c4719820ed3a0ade570ad950bb4b925af467dd6b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:39 +0800 Subject: [PATCH 076/148] Revert "anolis: net/smc: Supplement for SMC-R iWARP support" This reverts commit 347bd8a2484cbfb156bd4221f6eb0e1aa57057df. --- include/rdma/ib_verbs.h | 2 -- net/smc/smc_ib.c | 6 ------ 2 files changed, 8 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a155f6d28ce2..c8cbf7e39a78 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1133,8 +1133,6 @@ enum ib_qp_create_flags { IB_QP_CREATE_SOURCE_QPN = 1 << 10, IB_QP_CREATE_PCI_WRITE_END_PADDING = IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, - - IB_QP_CREATE_IWARP_WITHOUT_CM = 1 << 25, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 331280e00a5a..00cc19d311b4 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -680,14 +680,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, }; - struct ib_device *ib_dev = lnk->smcibdev->ibdev; - struct ib_port_immutable immutable; int rc; - ib_dev->ops.get_port_immutable(ib_dev, lnk->ibport, &immutable); - if (immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP) - qp_attr.create_flags |= IB_QP_CREATE_IWARP_WITHOUT_CM; - lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); if (IS_ERR(lnk->roce_qp)) -- Gitee From 22145f532ca7c5e0789db998104ffdb5618185ab Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:41 +0800 Subject: [PATCH 077/148] Revert "anolis: net/smc: Introduce iWARP device support" This reverts commit 1245b9c9ad71090d83f4aec076ef1413ea76bee7. 
--- net/smc/smc_ib.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 00cc19d311b4..55435499dc3c 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -229,7 +229,7 @@ static int smc_ib_determine_gid_rcu(const struct net_device *ndev, u8 gid[], u8 *sgid_index, struct smc_init_info_smcrv2 *smcrv2) { - if (!smcrv2) { + if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) { if (gid) memcpy(gid, &attr->gid, SMC_GID_SIZE); if (sgid_index) @@ -284,11 +284,10 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); - if ((smcibdev->ibdev->port_data[ibport].immutable.core_cap_flags & - RDMA_CORE_CAP_PROT_IWARP) || (!IS_ERR(ndev) && + if (!IS_ERR(ndev) && ((!vlan_id && !is_vlan_dev(ndev)) || (vlan_id && is_vlan_dev(ndev) && - vlan_dev_vlan_id(ndev) == vlan_id)))) { + vlan_dev_vlan_id(ndev) == vlan_id))) { if (!smc_ib_determine_gid_rcu(ndev, attr, gid, sgid_index, smcrv2)) { rcu_read_unlock(); @@ -912,8 +911,7 @@ static int smc_ib_add_dev(struct ib_device *ibdev) u8 port_cnt; int i; - if (ibdev->node_type != RDMA_NODE_IB_CA && - ibdev->node_type != RDMA_NODE_RNIC) + if (ibdev->node_type != RDMA_NODE_IB_CA) return -EOPNOTSUPP; smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); -- Gitee From 1a3a873250ad1020fca299455574e2856c8ed2ca Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:43 +0800 Subject: [PATCH 078/148] Revert "anolis: net/smc: Forward wake-up to smc socket wait queue when fallback" This reverts commit a509eb9707a4c99c6cd20dfd7c96350915477fa7. 
--- net/smc/af_smc.c | 134 ++++------------------------------------------- net/smc/smc.h | 11 ---- 2 files changed, 10 insertions(+), 135 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cc7f0ccdf383..5be0224705c4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -67,10 +67,6 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); -static void smc_clcsock_state_change(struct sock *clcsk); -static void smc_clcsock_data_ready(struct sock *clcsk); -static void smc_clcsock_write_space(struct sock *clcsk); -static void smc_clcsock_error_report(struct sock *clcsk); static void smc_set_keepalive(struct sock *sk, int val) { @@ -577,8 +573,6 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - struct sock *clcsk = smc->clcsock->sk; - smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -588,19 +582,6 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - - smc->clcsk_state_change = clcsk->sk_state_change; - smc->clcsk_data_ready = clcsk->sk_data_ready; - smc->clcsk_write_space = clcsk->sk_write_space; - smc->clcsk_error_report = clcsk->sk_error_report; - - clcsk->sk_state_change = smc_clcsock_state_change; - clcsk->sk_data_ready = smc_clcsock_data_ready; - clcsk->sk_write_space = smc_clcsock_write_space; - clcsk->sk_error_report = smc_clcsock_error_report; - - smc->clcsock->sk->sk_user_data = - (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } } @@ -2099,117 +2080,22 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } -static void smc_wake_up_waitqueue(struct smc_sock *smc, void *key) -{ - struct socket_wq *wq; - 
__poll_t flags; - - rcu_read_lock(); - wq = rcu_dereference(smc->sk.sk_wq); - if (skwq_has_sleeper(wq)) { - if (!key) { - /* sk_state_change */ - wake_up_interruptible_all(&wq->wait); - } else { - flags = key_to_poll(key); - if (flags & (EPOLLIN | EPOLLOUT)) - /* sk_data_ready or sk_write_space */ - wake_up_interruptible_sync_poll(&wq->wait, flags); - else if (flags & EPOLLERR) - /* sk_error_report */ - wake_up_interruptible_poll(&wq->wait, flags); - } - } - rcu_read_unlock(); -} - -static int smc_mark_clcwq_woken(wait_queue_entry_t *wait, unsigned int mode, - int sync, void *key) -{ - struct smc_wait_private *priv = wait->private; - - priv->woken = true; - priv->key = key; - return 0; -} - -static void smc_forward_wake_up_waitqueue(struct smc_sock *smc, struct sock *clcsk, - void (*clcsk_callback)(struct sock *sk)) +static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_wait_private wait_priv; - struct wait_queue_entry wait; - - init_waitqueue_func_entry(&wait, smc_mark_clcwq_woken); - wait_priv.woken = false; - wait.private = &wait_priv; - - add_wait_queue(sk_sleep(clcsk), &wait); - clcsk_callback(clcsk); - remove_wait_queue(sk_sleep(clcsk), &wait); - - if (wait_priv.woken) - smc_wake_up_waitqueue(smc, wait_priv.key); -} - -static void smc_clcsock_state_change(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up_waitqueue(smc, clcsk, smc->clcsk_state_change); -} - -static void smc_clcsock_data_ready(struct sock *clcsk) -{ - struct smc_sock *smc; + struct smc_sock *lsmc; - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) + lsmc = (struct smc_sock *) + ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!lsmc) return; - - if (!smc->use_fallback) { - /* listening situation */ - smc->clcsk_data_ready(clcsk); - if (smc->sk.sk_state == SMC_LISTEN) { - 
sock_hold(&smc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_hs_wq, &smc->tcp_listen_work)) - sock_put(&smc->sk); - } - } else { - /* fallback situation */ - smc_forward_wake_up_waitqueue(smc, clcsk, smc->clcsk_data_ready); + lsmc->clcsk_data_ready(listen_clcsock); + if (lsmc->sk.sk_state == SMC_LISTEN) { + sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) + sock_put(&lsmc->sk); } } -static void smc_clcsock_write_space(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up_waitqueue(smc, clcsk, smc->clcsk_write_space); -} - -static void smc_clcsock_error_report(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up_waitqueue(smc, clcsk, smc->clcsk_error_report); -} - static int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; diff --git a/net/smc/smc.h b/net/smc/smc.h index 3d0ca28e7395..cf9f92b4c1b9 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -135,11 +135,6 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; -struct smc_wait_private { - bool woken; - void *key; -}; - struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -231,14 +226,8 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ - void (*clcsk_state_change)(struct sock *sk); - /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); /* original data_ready fct. **/ - void (*clcsk_write_space)(struct sock *sk); - /* original write_space fct. */ - void (*clcsk_error_report)(struct sock *sk); - /* original error_report fct. 
*/ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ -- Gitee From 2c8c4e7e0278fd1d3090f0005028550b047285c7 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:45 +0800 Subject: [PATCH 079/148] Revert "anolis: Revert "net/smc: Transfer remaining wait queue entries during fallback"" This reverts commit 009281f565cde730371478e02ad0b4cd3c49318a. --- net/smc/af_smc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5be0224705c4..99e0023ca6d6 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -573,6 +573,10 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { + wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); + wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); + unsigned long flags; + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -582,6 +586,16 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + + /* There may be some entries remaining in + * smc socket->wq, which should be removed + * to clcsocket->wq during the fallback. + */ + spin_lock_irqsave(&smc_wait->lock, flags); + spin_lock(&clc_wait->lock); + list_splice_init(&smc_wait->head, &clc_wait->head); + spin_unlock(&clc_wait->lock); + spin_unlock_irqrestore(&smc_wait->lock, flags); } } -- Gitee From 5d18e966b387f35f9b239464ad3074b8a469e700 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:47 +0800 Subject: [PATCH 080/148] Revert "anolis: Revert "net/smc: Avoid warning of possible recursive locking"" This reverts commit f3254be730fae5f74ada714aeb910851e3504d04. 
--- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 99e0023ca6d6..155e8a2be212 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -592,7 +592,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * to clcsocket->wq during the fallback. */ spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock(&clc_wait->lock); + spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); list_splice_init(&smc_wait->head, &clc_wait->head); spin_unlock(&clc_wait->lock); spin_unlock_irqrestore(&smc_wait->lock, flags); -- Gitee From 48b3a9f7e52410b2d16cdf7684b95ff3e433186a Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:50 +0800 Subject: [PATCH 081/148] Revert "anolis: net/smc: allow different subnet communication" This reverts commit 15b7c8bf53a45f265927ce398e0bce54e1887951. --- include/net/netns/smc.h | 1 - net/smc/af_smc.c | 14 ++++---------- net/smc/smc_sysctl.c | 9 --------- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index c531cb2aac8b..7f8256f3247d 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -27,7 +27,6 @@ struct netns_smc { int sysctl_rmem_default; int sysctl_tcp2smc; int sysctl_autocorking; - int sysctl_allow_different_subnet; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 155e8a2be212..835731e80413 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1870,7 +1870,6 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -1878,12 +1877,10 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - if (!net->smc.sysctl_allow_different_subnet) { - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = 
smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); - } + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -2740,7 +2737,6 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_autocorking = 1; - net->smc.sysctl_allow_different_subnet = 0; } return smc_pnet_net_init(net); @@ -2749,7 +2745,6 @@ static __net_init int smc_net_init(struct net *net) static void __net_exit smc_net_exit(struct net *net) { net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_allow_different_subnet = 0; smc_pnet_net_exit(net); } @@ -2876,7 +2871,6 @@ static int __init smc_init(void) init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; init_net.smc.sysctl_autocorking = 1; - init_net.smc.sysctl_allow_different_subnet = 0; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index ad4c1c8a51cc..a9fca59512bd 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -44,15 +44,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { - .procname = "allow_different_subnet", - .data = &init_net.smc.sysctl_allow_different_subnet, - .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; -- Gitee From 6749f36e3df602287569284217418a6a1b4dbd3e Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:52 +0800 Subject: [PATCH 082/148] Revert "anolis: net/smc: support auto-cork with nagle algorithm" This reverts commit bc6e499af391de8119f2d71c27272b1dfe410cbf. 
--- include/net/netns/smc.h | 1 - net/smc/af_smc.c | 24 +++++++++++- net/smc/smc.h | 2 - net/smc/smc_cdc.c | 14 +------ net/smc/smc_cdc.h | 2 - net/smc/smc_sysctl.c | 9 ----- net/smc/smc_tx.c | 86 +++++++---------------------------------- 7 files changed, 36 insertions(+), 102 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 7f8256f3247d..0c9e1c7feda7 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -26,7 +26,6 @@ struct netns_smc { int sysctl_wmem_default; int sysctl_rmem_default; int sysctl_tcp2smc; - int sysctl_autocorking; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 835731e80413..1221dca53654 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2476,6 +2476,28 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, rc = -EINVAL; } break; + case TCP_NODELAY: + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { + if (val) { + SMC_STAT_INC(smc, ndly_cnt); + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); + } + } + break; + case TCP_CORK: + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { + if (!val) { + SMC_STAT_INC(smc, cork_cnt); + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); + } + } + break; case TCP_DEFER_ACCEPT: smc->sockopt_defer_accept = val; break; @@ -2736,7 +2758,6 @@ static __net_init int smc_net_init(struct net *net) net->smc.sysctl_rmem_default = init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_autocorking = 1; } return smc_pnet_net_init(net); @@ -2870,7 +2891,6 @@ static int __init smc_init(void) init_net.smc.sysctl_wmem_default = 256 * 1024; init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; - init_net.smc.sysctl_autocorking = 1; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc.h b/net/smc/smc.h index cf9f92b4c1b9..f158e552f43c 100644 --- 
a/net/smc/smc.h +++ b/net/smc/smc.h @@ -178,8 +178,6 @@ struct smc_connection { spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ - atomic_t cdc_pend_tx_wr; /* pending tx CDC wr */ - atomic_t tx_pushing; /* num of user trying tx push */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index e4e3cebff922..99acd337ba90 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -51,13 +51,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn); conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - /* If this is the last pending WR complete, push them to prevent - * no one trying to push when corked. - */ - if (likely(!cdcpend->validation) && - atomic_dec_and_test(&conn->cdc_pend_tx_wr)) - smc_tx_sndbuf_nonempty(conn); - smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); } @@ -113,18 +106,14 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - atomic_inc(&conn->cdc_pend_tx_wr); - smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ - smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (likely(!rc)) { + if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - atomic_dec(&conn->cdc_pend_tx_wr); } return rc; @@ -146,7 +135,6 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn, peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. 
tx */ peer->token = htonl(local->token); peer->prod_flags.failover_validation = 1; - pend->validation = 1; rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); return rc; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index b790f2af8607..0a0a89abd38b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -284,8 +284,6 @@ struct smc_cdc_tx_pend { union smc_host_cursor cursor; /* tx sndbuf cursor sent */ union smc_host_cursor p_cursor; /* rx RMBE cursor produced */ u16 ctrl_seq; /* conn. tx sequence # */ - u16 validation:1; - u16 reserved:15; }; int smc_cdc_get_free_slot(struct smc_connection *conn, diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a9fca59512bd..e3942837c3e3 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -35,15 +35,6 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "autocorking", - .data = &init_net.smc.sysctl_autocorking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 8f1d8cd13191..efe0b393a5fe 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -31,6 +31,7 @@ #include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 +#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -124,37 +125,11 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) return rc; } -/* Strategy: Nagle algorithm - * 1. The first message should never cork - * 2. If we have any inflight messages, wait for the first - * message back - * 3. 
The total corked message should not exceed min(64k, sendbuf/2) - */ -static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +static bool smc_tx_is_corked(struct smc_sock *smc) { - struct smc_connection *conn = &smc->conn; - int prepared_send; - - /* First request && no more message should always pass */ - if (atomic_read(&conn->cdc_pend_tx_wr) == 0 && - !(msg->msg_flags & MSG_MORE)) - return false; - - /* If We have enough data in the send queue that have not been - * pushed, send immediately. - * Note, here we only care about the prepared_sends, but not - * sendbuf_space because sendbuf_space has nothing to do with - * corked data size. - */ - prepared_send = smc_tx_prepared_sends(conn); - if (prepared_send > min(64 * 1024, conn->sndbuf_desc->len >> 1)) - return false; - - if (!sock_net(&smc->sk)->smc.sysctl_autocorking) - return false; - - /* All the other conditions should cork */ - return true; + struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); + + return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } /* sndbuf producer: main API called by socket layer. 
@@ -203,13 +178,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; - /* If our send queue is full but peer have RMBE space, - * we should send them out before wait - */ - if (!atomic_read(&conn->sndbuf_space) && - atomic_read(&conn->peer_rmbe_space) > 0) - smc_tx_sndbuf_nonempty(conn); - if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { rc = smc_tx_wait(smc, msg->msg_flags); if (rc) { @@ -269,17 +237,19 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if (smc_tx_should_cork(smc, msg)) { + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && + (atomic_read(&conn->sndbuf_space) > + (conn->sndbuf_desc->len >> 1))) { /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ conn->tx_corked_bytes += copylen; ++conn->tx_corked_cnt; + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, + SMC_TX_CORK_DELAY); } else { conn->tx_bytes += copylen; ++conn->tx_cnt; - if (delayed_work_pending(&conn->tx_work)) - cancel_delayed_work(&conn->tx_work); smc_tx_sndbuf_nonempty(conn); } @@ -616,31 +586,11 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc = 0; - struct smc_sock *smc = container_of(conn, struct smc_sock, conn); - - /* Only let one to push to prevent wasting of CPU and CDC slot */ - if (atomic_inc_return(&conn->tx_pushing) > 1) - return 0; - -again: - atomic_set(&conn->tx_pushing, 1); - - /* No data in the send queue */ - if (unlikely(smc_tx_prepared_sends(conn) <= 0)) - goto out; - - /* Peer don't have RMBE space */ - if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { - SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); - goto out; - } + int rc; if (conn->killed || - 
conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { - rc = -EPIPE; /* connection being aborted */ - goto out; - } + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -652,16 +602,6 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) conn); smc_close_wake_tx_prepared(smc); } - -out: - /* We need to check whether someone else have added some data into - * the send queue and tried to push but failed when we are pushing. - * If so, we need to try push again to prevent those data in the - * send queue may never been pushed out - */ - if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) - goto again; - return rc; } -- Gitee From ac78975f4f48b50e76e62cf49d03bae5ce521c11 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:54 +0800 Subject: [PATCH 083/148] Revert "anolis: net/smc: don't call ib_req_notify_cq in the send routine" This reverts commit c148881b88ee27488d5e3a256b2c0886e6a68ecd. 
--- net/smc/smc_ib.c | 6 ------ net/smc/smc_wr.c | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 55435499dc3c..b414bb12e6ca 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -135,12 +135,6 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; - - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc) - goto out; - rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 2aebdf49020f..fe7d4e05722e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -325,6 +325,8 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From 741cf3ca3e40bad4f3b5f50134db687739c0616e Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:56 +0800 Subject: [PATCH 084/148] Revert "anolis: net/smc: Add SMC-R link-down counters" This reverts commit 143f02b6923be79ebb34ab38f08cb713390c1735. 
--- include/uapi/linux/smc_diag.h | 2 -- net/smc/smc_core.c | 14 +++----------- net/smc/smc_core.h | 2 -- net/smc/smc_diag.c | 2 -- net/smc/smc_llc.c | 3 --- net/smc/smc_tx.c | 4 +--- net/smc/smc_wr.c | 5 ----- 7 files changed, 4 insertions(+), 28 deletions(-) diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index b9b7bf4dacc8..182efdd3ec91 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -95,8 +95,6 @@ struct smc_diag_linkinfo { __u8 ibport; /* RDMA device port number */ __u8 gid[40]; /* local GID */ __u8 peer_gid[40]; /* peer GID */ - __u64 link_down_cnt_smc; /* link down caused by SMC-R protocol */ - __u64 link_down_cnt_ib; /* link down caused by IB net device */ }; struct smc_diag_lgrinfo { diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e93d3ce74951..e93458e3e7a4 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -767,8 +767,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; lnk->link_idx = link_idx; - lnk->link_down_cnt_smc = 0; - lnk->link_down_cnt_ib = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); @@ -1073,20 +1071,16 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); - if (rc) { - ++to_lnk->link_down_cnt_smc; + if (rc) goto err_out; - } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); - if (rc) { - ++to_lnk->link_down_cnt_ib; + if (rc) goto err_out; - } goto again; } read_unlock_bh(&lgr->conns_lock); @@ -1682,10 +1676,8 @@ void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) struct smc_link *lnk = &lgr->lnk[i]; if 
(smc_link_usable(lnk) && - lnk->smcibdev == smcibdev && lnk->ibport == ibport) { - ++lnk->link_down_cnt_ib; + lnk->smcibdev == smcibdev && lnk->ibport == ibport) smcr_link_down_cond_sched(lnk); - } } } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 28d0bcb5759f..93e0e6c647ba 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -148,8 +148,6 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ - u64 link_down_cnt_smc; /* smc-caused link down counter */ - u64 link_down_cnt_ib; /* ib-caused link down counter */ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index ddecd39aa4a4..1fa7c7cf9332 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -155,8 +155,6 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .role = smc->conn.lgr->role, .lnk[0].ibport = smc->conn.lnk->ibport, .lnk[0].link_id = smc->conn.lnk->link_id, - .lnk[0].link_down_cnt_smc = smc->conn.lnk->link_down_cnt_smc, - .lnk[0].link_down_cnt_ib = smc->conn.lnk->link_down_cnt_ib, }; memcpy(linfo.lnk[0].ibname, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index b8587cb50300..b74342c8433e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1281,14 +1281,12 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, true, SMC_LLC_DEL_NO_ASYM_NEEDED); if (rc) { - ++lnk_new->link_down_cnt_ib; smcr_link_down_cond(lnk_new); goto out_free; } qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, SMC_LLC_DELETE_LINK); if (!qentry) { - ++lnk_new->link_down_cnt_ib; smcr_link_down_cond(lnk_new); goto out_free; } @@ -2102,7 +2100,6 @@ static void smc_llc_testlink_work(struct work_struct *work) if (!smc_link_active(link)) return; /* link state changed */ if (rc <= 0) { - 
++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); return; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index efe0b393a5fe..82735741bc2a 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -304,10 +304,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) { - ++link->link_down_cnt_ib; + if (rc) smcr_link_down_cond_sched(link); - } return rc; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index fe7d4e05722e..600ab5889227 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -142,7 +142,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) memset(link->lgr->wr_tx_buf_v2, 0, sizeof(*link->lgr->wr_tx_buf_v2)); } - ++link->link_down_cnt_ib; /* terminate link */ smcr_link_down_cond_sched(link); } @@ -237,7 +236,6 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { - ++link->link_down_cnt_smc; /* timeout - terminate link */ smcr_link_down_cond_sched(link); return -EPIPE; @@ -347,7 +345,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); } return rc; @@ -402,7 +399,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (atomic_dec_and_test(&link->wr_reg_refcnt)) wake_up_all(&link->wr_reg_wait); if (!rc) { - ++link->link_down_cnt_ib; /* timeout - terminate link */ smcr_link_down_cond_sched(link); return -EPIPE; @@ -502,7 +498,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: - ++link->link_down_cnt_ib; smcr_link_down_cond_sched(link); break; default: -- 
Gitee From d39098978598ac1a284a7e5d6979c562d727ff74 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:25:58 +0800 Subject: [PATCH 085/148] Revert "anolis: net/smc: Add TX and RX diagnosis information" This reverts commit 441b27cc6a54629339f72af47a827945cfd35e9a. --- include/uapi/linux/smc_diag.h | 6 ------ net/smc/smc.h | 6 ------ net/smc/smc_core.c | 15 --------------- net/smc/smc_diag.c | 6 ------ net/smc/smc_rx.c | 2 -- net/smc/smc_tx.c | 9 ++------- 6 files changed, 2 insertions(+), 42 deletions(-) diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 182efdd3ec91..8cb3a6fef553 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -79,12 +79,6 @@ struct smc_diag_conninfo { struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ struct smc_diag_cursor tx_sent; /* sent cursor */ struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ - __u64 rx_cnt; /* rx counter */ - __u64 tx_cnt; /* tx counter */ - __u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ - __u64 rx_bytes; /* rx size */ - __u64 tx_bytes; /* tx size */ - __u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ }; /* SMC_DIAG_LINKINFO */ diff --git a/net/smc/smc.h b/net/smc/smc.h index f158e552f43c..f794e3fc4d43 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -210,12 +210,6 @@ struct smc_connection { u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ - u64 rx_cnt; /* rx counter */ - u64 tx_cnt; /* tx counter */ - u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ - u64 rx_bytes; /* rx size */ - u64 tx_bytes; /* tx size */ - u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 out_of_sync : 1; /* out of sync with peer */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e93458e3e7a4..368b0bc5064c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1774,20 +1774,6 
@@ static bool smcd_lgr_match(struct smc_link_group *lgr, return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } -static void smc_rx_tx_counter_init(struct smc_connection *conn) -{ - /* Initialize RX & TX diagnostic inform for each - * connection. These counters mean what smc wants - * net devices "TODO" insead of what has been "DONE" - */ - conn->rx_cnt = 0; - conn->tx_cnt = 0; - conn->tx_corked_cnt = 0; - conn->rx_bytes = 0; - conn->tx_bytes = 0; - conn->tx_corked_bytes = 0; -} - /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -1863,7 +1849,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; - smc_rx_tx_counter_init(conn); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 1fa7c7cf9332..40036e9926e0 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,12 +136,6 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, - .rx_cnt = conn->rx_cnt, - .tx_cnt = conn->tx_cnt, - .tx_corked_cnt = conn->tx_corked_cnt, - .rx_bytes = conn->rx_bytes, - .tx_bytes = conn->tx_bytes, - .tx_corked_bytes = conn->tx_corked_bytes, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index bf353c68323d..51e8eb2933ff 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -392,7 +392,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable--; /* always stop at urgent Byte */ /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); - conn->rx_bytes += 
copylen; /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - @@ -442,7 +441,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } trace_smc_rx_recvmsg(smc, copylen); - ++conn->rx_cnt; } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 82735741bc2a..02d147bde78c 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -239,19 +239,14 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) conn->urg_tx_pend = true; if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_desc->len >> 1))) { + (conn->sndbuf_desc->len >> 1))) /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ - conn->tx_corked_bytes += copylen; - ++conn->tx_corked_cnt; queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_CORK_DELAY); - } else { - conn->tx_bytes += copylen; - ++conn->tx_cnt; + else smc_tx_sndbuf_nonempty(conn); - } trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From a16f9853075873daf08d4ec16a549b3d3f5a61df Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:26:05 +0800 Subject: [PATCH 086/148] Revert "anolis: net/smc: Introduce TCP to SMC replacement netlink commands" This reverts commit 76ac359b14c9bd7a90145f06b1fcce407e13734d. 
--- include/net/netns/smc.h | 8 -- include/uapi/linux/smc.h | 3 - net/smc/Makefile | 2 +- net/smc/af_smc.c | 12 +-- net/smc/smc_conv.c | 186 --------------------------------------- net/smc/smc_conv.h | 22 ----- net/smc/smc_netlink.c | 19 +--- net/smc/smc_netlink.h | 5 -- net/socket.c | 39 ++------ 9 files changed, 9 insertions(+), 287 deletions(-) delete mode 100644 net/smc/smc_conv.c delete mode 100644 net/smc/smc_conv.h diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 0c9e1c7feda7..322203b2ac21 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -6,20 +6,12 @@ struct smc_stats_rsn; struct smc_stats; -struct smc_convert { - int wlist_len; - struct mutex wlist_lock; - struct list_head wlist; - int (*smc_conv_match_rcu)(struct net *net, char *comm); -}; - struct netns_smc { /* per cpu counters for SMC */ struct smc_stats __percpu *smc_stats; /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - struct smc_convert smc_conv; #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index b69bd17f6a52..20f33b27787f 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -59,9 +59,6 @@ enum { SMC_NETLINK_DUMP_SEID, SMC_NETLINK_ENABLE_SEID, SMC_NETLINK_DISABLE_SEID, - SMC_NETLINK_ADD_TCP2SMC_WLIST, - SMC_NETLINK_DEL_TCP2SMC_WLIST, - SMC_NETLINK_GET_TCP2SMC_WLIST, }; /* SMC_GENL_FAMILY top level attributes */ diff --git a/net/smc/Makefile b/net/smc/Makefile index 72b3c934e473..19076ff20d58 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o smc_conv.o +smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o diff --git 
a/net/smc/af_smc.c b/net/smc/af_smc.c index 1221dca53654..4d1493a5aa92 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,7 +53,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_proc.h" -#include "smc_conv.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -2872,16 +2871,10 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_conv_init(); - if (rc) { - pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); - goto out_proc; - } - rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_conv; + goto out_proc; } limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); @@ -2899,8 +2892,6 @@ static int __init smc_init(void) static_branch_enable(&tcp_have_smc); return 0; -out_conv: - smc_conv_exit(); out_proc: smc_proc_exit(); out_sock: @@ -2928,7 +2919,6 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); - smc_conv_exit(); smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_conv.c b/net/smc/smc_conv.c deleted file mode 100644 index e1f87d1de8a5..000000000000 --- a/net/smc/smc_conv.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include -#include "smc_netlink.h" -#include "smc_conv.h" - -int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *wlist_elem, *tmp; - char msg[TASK_COMM_LEN]; - struct nlattr *na; - - na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; - if (!na) - return -EINVAL; - - nla_strlcpy(msg, na, TASK_COMM_LEN); - - mutex_lock(wlist_lock); - if (*wlist_len >= SMC_MAX_WLIST_LEN) { - 
mutex_unlock(wlist_lock); - return -EINVAL; - } - - list_for_each_entry(tmp, wlist, list) { - if (!strcmp(tmp->task_comm, msg)) - goto out; - } - - wlist_elem = kmalloc(sizeof(*wlist_elem), GFP_KERNEL); - if (!wlist_elem) { - mutex_unlock(wlist_lock); - return -ENOMEM; - } - - strcpy(wlist_elem->task_comm, msg); - list_add_tail_rcu(&wlist_elem->list, wlist); - ++*wlist_len; -out: - mutex_unlock(wlist_lock); - return 0; -} - -int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *tmp, *nxt; - char msg[TASK_COMM_LEN]; - struct nlattr *na; - - na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; - if (!na) - return -EINVAL; - - nla_strlcpy(msg, na, TASK_COMM_LEN); - - mutex_lock(wlist_lock); - list_for_each_entry_safe(tmp, nxt, wlist, list) { - if (!strcmp(tmp->task_comm, msg)) { - list_del_rcu(&tmp->list); - synchronize_rcu(); - kfree(tmp); - --*wlist_len; - break; - } - } - mutex_unlock(wlist_lock); - return 0; -} - -int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - struct list_head *wlist = &net->smc.smc_conv.wlist; - struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); - struct smc_conv_wlist_elem *tmp; - void *nlh; - - if (cb_ctx->pos[0]) - goto errmsg; - - nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &smc_gen_nl_family, NLM_F_MULTI, - SMC_NETLINK_GET_TCP2SMC_WLIST); - if (!nlh) - goto errmsg; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp, wlist, list) { - if (nla_put(skb, SMC_CMD_ATTR_TCP2SMC, - nla_total_size(strlen(tmp->task_comm) + 1), - tmp->task_comm)) { - rcu_read_unlock(); - goto errattr; - } - } - rcu_read_unlock(); - - genlmsg_end(skb, nlh); - cb_ctx->pos[0] = 1; - return skb->len; - -errattr: - 
genlmsg_cancel(skb, nlh); -errmsg: - return skb->len; -} - -static int smc_match_tcp2smc_wlist(struct net *net, char *comm) -{ - struct list_head *wlist = &net->smc.smc_conv.wlist; - struct smc_conv_wlist_elem *tmp; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp, wlist, list) { - if (!strcmp(tmp->task_comm, comm)) { - rcu_read_unlock(); - return 0; - } - } - rcu_read_unlock(); - return -1; -} - -static int __net_init smc_net_conv_init(struct net *net) -{ - INIT_LIST_HEAD_RCU(&net->smc.smc_conv.wlist); - net->smc.smc_conv.wlist_len = 0; - - mutex_init(&net->smc.smc_conv.wlist_lock); - - rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, - smc_match_tcp2smc_wlist); - return 0; -} - -static void __net_exit smc_net_conv_exit(struct net *net) -{ - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *cur, *nxt; - struct list_head tmp_list; - - rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, NULL); - synchronize_rcu(); - - INIT_LIST_HEAD(&tmp_list); - - mutex_lock(wlist_lock); - list_splice_init_rcu(wlist, &tmp_list, synchronize_rcu); - *wlist_len = 0; - mutex_unlock(wlist_lock); - - list_for_each_entry_safe(cur, nxt, &tmp_list, list) { - list_del(&cur->list); - kfree(cur); - } -} - -static struct pernet_operations smc_conv_ops = { - .init = smc_net_conv_init, - .exit = smc_net_conv_exit, -}; - -int __init smc_conv_init(void) -{ - return register_pernet_subsys(&smc_conv_ops); -} - -void smc_conv_exit(void) -{ - unregister_pernet_subsys(&smc_conv_ops); -} diff --git a/net/smc/smc_conv.h b/net/smc/smc_conv.h deleted file mode 100644 index 1615b27feede..000000000000 --- a/net/smc/smc_conv.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef NET_SMC_SMC_CONV_H_ -#define NET_SMC_SMC_CONV_H_ -#include -#include -#include - -#define SMC_MAX_WLIST_LEN 32 - -struct smc_conv_wlist_elem { - char 
task_comm[TASK_COMM_LEN]; - struct list_head list; -}; - -int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); -int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); -int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb); -int __init smc_conv_init(void); -void smc_conv_exit(void); - -#endif /* NET_SMC_SMC_CONV_H_ */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index f2007aa124cf..f13ab0661ed5 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -22,7 +22,6 @@ #include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" -#include "smc_conv.h" const struct nla_policy smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { @@ -112,25 +111,9 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_seid, }, - { - .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .doit = smc_nl_add_tcp2smc_wlist, - }, - { - .cmd = SMC_NETLINK_DEL_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .doit = smc_nl_del_tcp2smc_wlist, - }, - { - .cmd = SMC_NETLINK_GET_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .dumpit = smc_nl_get_tcp2smc_wlist, - }, }; -static const struct nla_policy smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { - [SMC_CMD_ATTR_TCP2SMC] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN - 1 }, +static const struct nla_policy smc_gen_nl_policy[2] = { [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, }; diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index aae13737095e..e8c6c3f0e98c 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -15,11 +15,6 @@ #include #include -enum { - SMC_CMD_ATTR_TCP2SMC = 1, - SMC_CMD_MAX_ATTR, -}; - extern struct genl_family smc_gen_nl_family; extern const struct nla_policy smc_gen_ueid_policy[]; diff --git a/net/socket.c b/net/socket.c index 3917e02b2b2f..96860a0f9330 100644 --- a/net/socket.c +++ b/net/socket.c @@ 
-141,38 +141,6 @@ static void sock_show_fdinfo(struct seq_file *m, struct file *f) #define sock_show_fdinfo NULL #endif -#if IS_ENABLED(CONFIG_SMC) -static bool try_tcp2smc_convert(struct net *net, int *family, int type, - int *protocol, int kern) -{ - int (*f)(struct net *n, char *c) = NULL; - - /* Only convert userspace socket */ - if (kern) - return false; - - if ((*family == AF_INET || *family == AF_INET6) && - type == SOCK_STREAM && - (*protocol == IPPROTO_IP || *protocol == IPPROTO_TCP)) { - if (net->smc.sysctl_tcp2smc) - goto convert; - - rcu_read_lock(); - f = rcu_dereference(net->smc.smc_conv.smc_conv_match_rcu); - if (f && !f(net, current->comm)) { - rcu_read_unlock(); - goto convert; - } - rcu_read_unlock(); - } - return false; -convert: - *protocol = (*family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; - *family = AF_SMC; - return true; -} -#endif - /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. @@ -1400,7 +1368,12 @@ int __sock_create(struct net *net, int family, int type, int protocol, family = PF_PACKET; } #if IS_ENABLED(CONFIG_SMC) - try_tcp2smc_convert(net, &family, type, &protocol, kern); + if (!kern && (family == AF_INET || family == AF_INET6) && + type == SOCK_STREAM && (protocol == IPPROTO_IP || + protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { + protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; + family = AF_SMC; + } #endif err = security_socket_create(family, type, protocol, kern); -- Gitee From b2c57b7345cbca2637273c238c3f2470c7944a34 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:26:07 +0800 Subject: [PATCH 087/148] Revert "anolis: net/smc: Introduce SMC-R-related proc files" This reverts commit 870ceb456f097c2d22dd1fb45d399703466048a5. 
--- include/net/net_namespace.h | 1 - include/net/smc.h | 5 +- net/smc/Makefile | 2 +- net/smc/af_smc.c | 27 +--- net/smc/smc_diag.c | 29 ++-- net/smc/smc_proc.c | 287 ------------------------------------ net/smc/smc_proc.h | 34 ----- 7 files changed, 22 insertions(+), 363 deletions(-) delete mode 100644 net/smc/smc_proc.c delete mode 100644 net/smc/smc_proc.h diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 220878bfe86b..76e9cce289a4 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,7 +95,6 @@ struct net { struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; - struct proc_dir_entry *proc_net_smc; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; diff --git a/include/net/smc.h b/include/net/smc.h index 743b4fe74346..e441aa97ad61 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -12,13 +12,10 @@ #define _SMC_H #define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ -#define SMC_HTABLE_SHIFT 9 -#define SMC_HTABLE_SIZE (1 << SMC_HTABLE_SHIFT) /* Size of SMC hashtable buckets */ struct smc_hashinfo { - unsigned int bkt_idx; rwlock_t lock; - struct hlist_head ht[SMC_HTABLE_SIZE]; + struct hlist_head ht; }; int smc_hash_sk(struct sock *sk); diff --git a/net/smc/Makefile b/net/smc/Makefile index 19076ff20d58..640af9a39f9c 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o +smc-y += smc_tracepoint.o smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4d1493a5aa92..ed551eb721c7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -52,7 +52,6 @@ #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" -#include "smc_proc.h" 
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -87,13 +86,11 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - write_lock_bh(&h->lock); - - head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + head = &h->ht; + write_lock_bh(&h->lock); sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - write_unlock_bh(&h->lock); return 0; @@ -2792,7 +2789,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc, i; + int rc; int max_rshare, max_wshare; unsigned long limit; @@ -2859,22 +2856,13 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - - for (i = 0; i < SMC_HTABLE_SIZE; i++) { - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); - } - - rc = smc_proc_init(); - if (rc) { - pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); - goto out_sock; - } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_proc; + goto out_sock; } limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); @@ -2892,8 +2880,6 @@ static int __init smc_init(void) static_branch_enable(&tcp_have_smc); return 0; -out_proc: - smc_proc_exit(); out_sock: sock_unregister(PF_SMC); out_proto6: @@ -2919,7 +2905,6 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); - smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 40036e9926e0..c952986a6aca 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -196,25 +196,24 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct 
hlist_head *head; - int rc = 0, num = 0, slot; + int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - - for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { - head = &prot->h.smc_hash->ht[slot]; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + head = &prot->h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; - } + num++; } out: diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c deleted file mode 100644 index 19d8cc82a7ac..000000000000 --- a/net/smc/smc_proc.c +++ /dev/null @@ -1,287 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include "smc.h" -#include "smc_proc.h" -#include "smc_core.h" - -static void *smc_get_next(struct seq_file *seq, void *cur) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? 
- smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - struct net *net = seq_file_net(seq); - struct hlist_head *head; - struct sock *sk = cur; - - if (!sk) { - read_lock(&smc_hash->lock); -get_head: - head = &smc_hash->ht[sp->bucket]; - sk = sk_head(head); - sp->offset = 0; - goto get_sk; - } - ++sp->num; - ++sp->offset; - - sk = sk_next(sk); -get_sk: - sk_for_each_from(sk) { - if (!net_eq(sock_net(sk), net)) - continue; - return sk; - } - sp->offset = 0; - if (++sp->bucket < SMC_HTABLE_SIZE) - goto get_head; - - read_unlock(&smc_hash->lock); - return NULL; -} - -static void *smc_seek_last_pos(struct seq_file *seq) -{ - struct smc_proc_private *sp = seq->private; - int offset = sp->offset; - int orig_num = sp->num; - void *rc = NULL; - - if (sp->bucket >= SMC_HTABLE_SIZE) - goto out; - - rc = smc_get_next(seq, NULL); - while (offset-- && rc) - rc = smc_get_next(seq, rc); - - if (rc) - goto out; - - sp->bucket = 0; -out: - sp->num = orig_num; - return rc; -} - -static void *smc_get_idx(struct seq_file *seq, loff_t pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - sp->bucket = 0; - rc = smc_get_next(seq, NULL); - - while (rc && pos) { - rc = smc_get_next(seq, rc); - --pos; - } - return rc; -} - -static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - if (*pos && *pos == sp->last_pos) { - rc = smc_seek_last_pos(seq); - if (rc) - goto out; - } - - sp->num = 0; - sp->bucket = 0; - sp->offset = 0; - sp->protocol = protocol; - rc = *pos ? 
smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - -out: - sp->last_pos = *pos; - return rc; -} - -static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC); -} - -static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC6); -} - -static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) -{ - struct smc_proc_private *sp = seq->private; - const struct in6_addr *dest, *src; - struct smc_link_group *lgr; - struct socket *clcsock; - struct smc_link *lnk; - struct sock *sk; - bool fb = false; - int i; - - fb = smc->use_fallback; - clcsock = smc->clcsock; - sk = &smc->sk; - - if (protocol == SMCPROTO_SMC) - seq_printf(seq, CONN4_ADDR_FM, sp->num, - clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, - clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); - else if (protocol == SMCPROTO_SMC6) { - dest = &clcsock->sk->sk_v6_daddr; - src = &clcsock->sk->sk_v6_rcv_saddr; - seq_printf(seq, CONN6_ADDR_FM, sp->num, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); - } - - seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, - sk, clcsock->sk, fb ? clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); - - lgr = smc->conn.lgr; - lnk = smc->conn.lnk; - - if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { - for (i = 0; i < SMC_LGR_ID_SIZE; i++) - seq_printf(seq, "%02X", lgr->id[i]); - - seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', - lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); - } else { - seq_puts(seq, "- - - - - - - -\n"); - } -} - -static int smc_conn_show(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct socket *clcsock; - struct smc_sock *smc; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, - "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", - "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_cnt", "rx_cnt"); - goto out; - } - - smc = smc_sk(v); - clcsock = smc->clcsock; - if (!clcsock) - goto out; - - _conn_show(seq, smc, sp->protocol); -out: - return 0; -} - -static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc = NULL; - - if (v == SEQ_START_TOKEN) { - rc = smc_get_idx(seq, 0); - goto out; - } - rc = smc_get_next(seq, v); -out: - ++*pos; - sp->last_pos = *pos; - return rc; -} - -static void smc_conn_stop(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? 
- smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - - if (v && v != SEQ_START_TOKEN) - read_unlock(&smc_hash->lock); -} - -static struct smc_proc_entry smc_proc[] = { - { - .name = "smc4", - .ops = { - .show = smc_conn_show, - .start = smc_conn4_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#if IS_ENABLED(CONFIG_IPV6) - { - .name = "smc6", - .ops = { - .show = smc_conn_show, - .start = smc_conn6_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#endif -}; - -static int __net_init smc_proc_dir_init(struct net *net) -{ - int i, rc = -ENOMEM; - - net->proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); - if (!net->proc_net_smc) - goto err; - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { - if (!proc_create_net_data(smc_proc[i].name, 0444, - net->proc_net_smc, &smc_proc[i].ops, - sizeof(struct smc_proc_private), - NULL)) - goto err_entry; - } - - return 0; - -err_entry: - for (i -= 1; i >= 0; i--) - remove_proc_entry(smc_proc[i].name, net->proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -err: - return rc; -} - -static void __net_exit smc_proc_dir_exit(struct net *net) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) - remove_proc_entry(smc_proc[i].name, net->proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -} - -static struct pernet_operations smc_proc_ops = { - .init = smc_proc_dir_init, - .exit = smc_proc_dir_exit, -}; - -int __init smc_proc_init(void) -{ - return register_pernet_subsys(&smc_proc_ops); -} - -void smc_proc_exit(void) -{ - unregister_pernet_subsys(&smc_proc_ops); -} diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h deleted file mode 100644 index ec59ca03e163..000000000000 --- a/net/smc/smc_proc.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _SMC_PROC_H_ -#define _SMC_PROC_H_ - -#include -#include -#include -#include -#include -#include "smc.h" - -#define CONN4_HDR 
("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") -#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") - -struct smc_proc_private { - struct seq_net_private p; - int num, bucket, offset; - int protocol; - loff_t last_pos; -}; - -struct smc_proc_entry { - const char *name; - const struct seq_operations ops; -}; - -int __init smc_proc_init(void); -void smc_proc_exit(void); - -#endif -- Gitee From 5de76bab7e44c56bd411b43b692804bfa4745225 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:26:09 +0800 Subject: [PATCH 088/148] Revert "anolis: net/smc: Introduce sysctl tcp2smc" This reverts commit 98361b31785180a76c03fe27e43b69e19281e7ac. --- include/net/netns/smc.h | 1 - net/smc/af_smc.c | 3 --- net/smc/smc_sysctl.c | 7 ------- net/socket.c | 8 -------- 4 files changed, 19 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 322203b2ac21..25bf418989aa 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -17,7 +17,6 @@ struct netns_smc { #endif int sysctl_wmem_default; int sysctl_rmem_default; - int sysctl_tcp2smc; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ed551eb721c7..e7cb32904ba3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2753,7 +2753,6 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_rmem_default = init_net.smc.sysctl_rmem_default; - net->smc.sysctl_tcp2smc = 0; } return smc_pnet_net_init(net); @@ -2761,7 +2760,6 @@ static __net_init int smc_net_init(struct net *net) static void __net_exit smc_net_exit(struct net *net) { - net->smc.sysctl_tcp2smc = 0; smc_pnet_net_exit(net); } @@ -2871,7 +2869,6 @@ static 
int __init smc_init(void) init_net.smc.sysctl_wmem_default = 256 * 1024; init_net.smc.sysctl_rmem_default = 384 * 1024; - init_net.smc.sysctl_tcp2smc = 0; #ifdef CONFIG_SYSCTL smc_sysctl_init(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index e3942837c3e3..317b37095c4f 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -28,13 +28,6 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, - { - .procname = "tcp2smc", - .data = &init_net.smc.sysctl_tcp2smc, - .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; diff --git a/net/socket.c b/net/socket.c index 96860a0f9330..d52c265ad449 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1367,14 +1367,6 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family = PF_PACKET; } -#if IS_ENABLED(CONFIG_SMC) - if (!kern && (family == AF_INET || family == AF_INET6) && - type == SOCK_STREAM && (protocol == IPPROTO_IP || - protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { - protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; - family = AF_SMC; - } -#endif err = security_socket_create(family, type, protocol, kern); if (err) -- Gitee From 09247915a406abccfaeac50ad9f0b8f0eaad5d94 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:26:11 +0800 Subject: [PATCH 089/148] Revert "anolis: net/smc: Expose SMCPROTO_SMC and SMCPROTO_SMC6 to userspace" This reverts commit 2a0084cc5a06a8d8e5acad13ad9991a3cd6f33ff. --- include/uapi/linux/in.h | 3 --- include/uapi/linux/in6.h | 2 -- net/smc/smc.h | 4 ++++ 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 40b1e51b18c9..d1b327036ae4 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,9 +84,6 @@ enum { }; #endif -/* SMC protocol, IPv4 */ -#define SMCPROTO_SMC 0 - #if __UAPI_DEF_IN_ADDR /* Internet address. 
*/ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 6c21c85be0e3..5ad396a57eb3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,8 +95,6 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 -/* SMC protocol, IPv6 */ -#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/net/smc/smc.h b/net/smc/smc.h index f794e3fc4d43..1505df7d98c4 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,6 +21,10 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 + +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ + #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ -- Gitee From 3b98801cbea284587e1ad0d2219891dcf43b8544 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 09:26:13 +0800 Subject: [PATCH 090/148] Revert "anolis: net/smc: Introduce tunable sysctls for sndbuf and RMB size" This reverts commit bce77db96f1b99b9641ca1ffc7585c5a8a26165c. 
--- include/net/netns/smc.h | 6 --- net/smc/Makefile | 2 +- net/smc/af_smc.c | 28 +------------- net/smc/smc.h | 5 --- net/smc/smc_sysctl.c | 81 ----------------------------------------- 5 files changed, 3 insertions(+), 119 deletions(-) delete mode 100644 net/smc/smc_sysctl.c diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 25bf418989aa..ea8a9cf2619b 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -12,11 +12,5 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; -#ifdef CONFIG_SYSCTL - struct ctl_table_header *smc_hdr; -#endif - int sysctl_wmem_default; - int sysctl_rmem_default; }; - #endif diff --git a/net/smc/Makefile b/net/smc/Makefile index 640af9a39f9c..196fb6f01b14 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o +smc-y += smc_tracepoint.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e7cb32904ba3..41eda98d153b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -244,8 +243,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - sk->sk_sndbuf = net->smc.sysctl_wmem_default; - sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -2733,6 +2730,8 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, sk_common_release(sk); goto out; } + smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); + smc->sk.sk_rcvbuf = 
max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); out: return rc; @@ -2748,13 +2747,6 @@ unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { - if (net != &init_net) { - net->smc.sysctl_wmem_default = - init_net.smc.sysctl_rmem_default; - net->smc.sysctl_rmem_default = - init_net.smc.sysctl_rmem_default; - } - return smc_pnet_net_init(net); } @@ -2788,8 +2780,6 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { int rc; - int max_rshare, max_wshare; - unsigned long limit; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -2863,17 +2853,6 @@ static int __init smc_init(void) goto out_sock; } - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); - max_wshare = min(4UL * 1024 * 1024, limit); - max_rshare = min(6UL * 1024 * 1024, limit); - - init_net.smc.sysctl_wmem_default = 256 * 1024; - init_net.smc.sysctl_rmem_default = 384 * 1024; - -#ifdef CONFIG_SYSCTL - smc_sysctl_init(); -#endif - static_branch_enable(&tcp_have_smc); return 0; @@ -2914,9 +2893,6 @@ static void __exit smc_exit(void) smc_clc_exit(); unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); -#ifdef CONFIG_SYSCTL - smc_sysctl_exit(); -#endif rcu_barrier(); } diff --git a/net/smc/smc.h b/net/smc/smc.h index 1505df7d98c4..f4286ca1f228 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -307,9 +307,4 @@ void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid); -#ifdef CONFIG_SYSCTL -int smc_sysctl_init(void); -void smc_sysctl_exit(void); -#endif - #endif /* __SMC_H */ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c deleted file mode 100644 index 317b37095c4f..000000000000 --- a/net/smc/smc_sysctl.c +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include - -#include "smc_core.h" - -static int min_sndbuf = SMC_BUF_MIN_SIZE; -static int min_rcvbuf = SMC_BUF_MIN_SIZE; - 
-static struct ctl_table smc_table[] = { - { - .procname = "wmem_default", - .data = &init_net.smc.sysctl_wmem_default, - .maxlen = sizeof(init_net.smc.sysctl_wmem_default), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &init_net.smc.sysctl_rmem_default, - .maxlen = sizeof(init_net.smc.sysctl_rmem_default), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { } -}; - -static __net_init int smc_sysctl_init_net(struct net *net) -{ - struct ctl_table *table; - - table = smc_table; - if (!net_eq(net, &init_net)) { - int i; - - table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); - if (!table) - goto err_alloc; - - for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) - table[i].data += (void *)net - (void *)&init_net; - } - - net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); - if (!net->smc.smc_hdr) - goto err_reg; - - return 0; - -err_reg: - if (!net_eq(net, &init_net)) - kfree(table); -err_alloc: - return -ENOMEM; -} - -static __net_exit void smc_sysctl_exit_net(struct net *net) -{ - unregister_net_sysctl_table(net->smc.smc_hdr); -} - -static struct pernet_operations smc_sysctl_ops __net_initdata = { - .init = smc_sysctl_init_net, - .exit = smc_sysctl_exit_net, -}; - -int __init smc_sysctl_init(void) -{ - return register_pernet_subsys(&smc_sysctl_ops); -} - -void smc_sysctl_exit(void) -{ - unregister_pernet_subsys(&smc_sysctl_ops); -} -- Gitee From 3f3da961e549621ab80962e10eaa2405ae1073b9 Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Wed, 15 Dec 2021 20:29:21 +0800 Subject: [PATCH 091/148] net/smc: Prevent smc_release() from long blocking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the nginx/wrk benchmark, a hang occurs with high probability in cases like the following (the client takes several minutes to exit): server: smc_run nginx client: smc_run wrk -c 10000 -t 1 http://server Client hangs with the following backtrace: 0 [ffffa7ce80f3bbf8] __schedule at ffffffff9f9e0d5f 1 [ffffa7ce80f3bc88] schedule at ffffffff9f9e10e6 2 [ffffa7ce80f3bca0] schedule_timeout at ffffffff9f9e3f3c 3 [ffffa7ce80f3bd20] wait_for_common at ffffffff9f9e19de 4 [ffffa7ce80f3bd80] __flush_work at ffffffff9f0fe013 5 [ffffa7ce80f3bdf0] smc_release at ffffffffc0697d24 [smc] 6 [ffffa7ce80f3be20] __sock_release at ffffffff9f802e2d 7 [ffffa7ce80f3be40] sock_close at ffffffff9f802eb1 8 [ffffa7ce80f3be48] __fput at ffffffff9f334f93 9 [ffffa7ce80f3be78] task_work_run at ffffffff9f101ff5 10 [ffffa7ce80f3bea0] do_exit at ffffffff9f0e5012 11 [ffffa7ce80f3bf10] do_group_exit at ffffffff9f0e592a 12 [ffffa7ce80f3bf38] __x64_sys_exit_group at ffffffff9f0e5994 13 [ffffa7ce80f3bf40] do_syscall_64 at ffffffff9f9d4373 14 [ffffa7ce80f3bf50] entry_SYSCALL_64_after_hwframe at ffffffff9fa0007c This issue is due to flush_work(), which is used to wait for smc_connect_work() to finish in smc_release(). Once lots of smc_connect_work() items are pending, or all executing work items are dangling, smc_release() has to block until a worker becomes free, which is equivalent to waiting for another smc_connect_work() to finish. In order to fix this, there are two changes: 1. For an idle smc_connect_work(), cancel it from the workqueue; for an executing smc_connect_work(), wait for it to finish. For that purpose, replace flush_work() with cancel_work_sync(). 2. Since smc_connect() holds a reference for passive closing, if smc_connect_work() has been cancelled, release the reference.
Fixes: 24ac3a08e658 ("net/smc: rebuild nonblocking connect") Reported-by: Tony Lu Tested-by: Dust Li Reviewed-by: Dust Li Reviewed-by: Tony Lu Signed-off-by: D. Wythe Acked-by: Karsten Graul Link: https://lore.kernel.org/r/1639571361-101128-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 41eda98d153b..c1583df1cd3f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -194,7 +194,9 @@ static int smc_release(struct socket *sock) /* cleanup for a dangling non-blocking connect */ if (smc->connect_nonblock && sk->sk_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); - flush_work(&smc->connect_work); + + if (cancel_work_sync(&smc->connect_work)) + sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires -- Gitee From c9740bae510e2405a90dbbc9d2a4377b1d78d39e Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 27 Dec 2021 14:35:30 +0100 Subject: [PATCH 092/148] net/smc: fix using of uninitialized completions In smc_wr_tx_send_wait() the completion on index specified by pend->idx is initialized and after smc_wr_tx_send() was called the wait for completion starts. pend->idx is used to get the correct index for the wait, but the pend structure could already be cleared in smc_wr_tx_process_cqe(). Introduce pnd_idx to hold and use a local copy of the correct index. Fixes: 09c61d24f96d ("net/smc: wait for departure of an IB message") Signed-off-by: Karsten Graul Signed-off-by: David S. 
Miller --- net/smc/smc_wr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 600ab5889227..79a7431f534e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -358,18 +358,20 @@ int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout) { struct smc_wr_tx_pend *pend; + u32 pnd_idx; int rc; pend = container_of(priv, struct smc_wr_tx_pend, priv); pend->compl_requested = 1; - init_completion(&link->wr_tx_compl[pend->idx]); + pnd_idx = pend->idx; + init_completion(&link->wr_tx_compl[pnd_idx]); rc = smc_wr_tx_send(link, priv); if (rc) return rc; /* wait for completion by smc_wr_tx_process_cqe() */ rc = wait_for_completion_interruptible_timeout( - &link->wr_tx_compl[pend->idx], timeout); + &link->wr_tx_compl[pnd_idx], timeout); if (rc <= 0) rc = -ENODATA; if (rc > 0) -- Gitee From 6c50d8e0a43ce3a63b509e9eb0020a0fd0013184 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 28 Dec 2021 17:03:24 +0800 Subject: [PATCH 093/148] net/smc: don't send CDC/LLC message if link not ready We found smc_llc_send_link_delete_all() sometimes wait for 2s timeout when testing with RDMA link up/down. It is possible when a smc_link is in ACTIVATING state, the underlaying QP is still in RESET or RTR state, which cannot send any messages out. smc_llc_send_link_delete_all() use smc_link_usable() to checks whether the link is usable, if the QP is still in RESET or RTR state, but the smc_link is in ACTIVATING, this LLC message will always fail without any CQE entering the CQ, and we will always wait 2s before timeout. Since we cannot send any messages through the QP before the QP enter RTS. I add a wrapper smc_link_sendable() which checks the state of QP along with the link state. And replace smc_link_usable() with smc_link_sendable() in all LLC & CDC message sending routine. Fixes: 5f08318f617b ("smc: connection data control (CDC)") Signed-off-by: Dust Li Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 2 +- net/smc/smc_core.h | 6 ++++++ net/smc/smc_llc.c | 2 +- net/smc/smc_wr.c | 4 ++-- net/smc/smc_wr.h | 2 +- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 368b0bc5064c..343afbdafb98 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -647,7 +647,7 @@ static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (smc_link_usable(lnk)) + if (smc_link_sendable(lnk)) lnk->state = SMC_LNK_INACTIVE; } wake_up_all(&lgr->llc_msg_waiter); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 93e0e6c647ba..f2a12d79793f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -416,6 +416,12 @@ static inline bool smc_link_usable(struct smc_link *lnk) return true; } +static inline bool smc_link_sendable(struct smc_link *lnk) +{ + return smc_link_usable(lnk) && + lnk->qp_attr.cur_qp_state == IB_QPS_RTS; +} + static inline bool smc_link_active(struct smc_link *lnk) { return lnk->state == SMC_LNK_ACTIVE; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index b74342c8433e..1d8dafa1a35e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1632,7 +1632,7 @@ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) delllc.reason = htonl(rsn); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_usable(&lgr->lnk[i])) + if (!smc_link_sendable(&lgr->lnk[i])) continue; if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc)) break; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 79a7431f534e..df1dc225cbab 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -188,7 +188,7 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) { *idx = link->wr_tx_cnt; - if (!smc_link_usable(link)) + if (!smc_link_sendable(link)) return -ENOLINK; 
for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) @@ -231,7 +231,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - !smc_link_usable(link) || + !smc_link_sendable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index f353311e6f84..48ed9b08ac7a 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -62,7 +62,7 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) static inline bool smc_wr_tx_link_hold(struct smc_link *link) { - if (!smc_link_usable(link)) + if (!smc_link_sendable(link)) return false; atomic_inc(&link->wr_tx_refcnt); return true; -- Gitee From 155c7a346e1028ff807263f6995af4ee4e1bd75b Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 28 Dec 2021 17:03:25 +0800 Subject: [PATCH 094/148] net/smc: fix kernel panic caused by race of smc_sock A crash occurs when smc_cdc_tx_handler() tries to access smc_sock but smc_release() has already freed it. [ 4570.695099] BUG: unable to handle page fault for address: 000000002eae9e88 [ 4570.696048] #PF: supervisor write access in kernel mode [ 4570.696728] #PF: error_code(0x0002) - not-present page [ 4570.697401] PGD 0 P4D 0 [ 4570.697716] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 4570.698228] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-rc4+ #111 [ 4570.699013] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/0 [ 4570.699933] RIP: 0010:_raw_spin_lock+0x1a/0x30 <...> [ 4570.711446] Call Trace: [ 4570.711746] [ 4570.711992] smc_cdc_tx_handler+0x41/0xc0 [ 4570.712470] smc_wr_tx_tasklet_fn+0x213/0x560 [ 4570.712981] ? 
smc_cdc_tx_dismisser+0x10/0x10 [ 4570.713489] tasklet_action_common.isra.17+0x66/0x140 [ 4570.714083] __do_softirq+0x123/0x2f4 [ 4570.714521] irq_exit_rcu+0xc4/0xf0 [ 4570.714934] common_interrupt+0xba/0xe0 Though smc_cdc_tx_handler() checked the existence of smc connection, smc_release() may have already dismissed and released the smc socket before smc_cdc_tx_handler() further visits it. smc_cdc_tx_handler() |smc_release() if (!conn) | | |smc_cdc_tx_dismiss_slots() | smc_cdc_tx_dismisser() | |sock_put(&smc->sk) <- last sock_put, | smc_sock freed bh_lock_sock(&smc->sk) (panic) | To make sure we won't receive any CDC messages after we free the smc_sock, add a refcount on the smc_connection for inflight CDC messages (posted to the QP but whose related CQE has not been received yet), and don't release the smc_connection until all the inflight CDC messages have completed, whether successfully or not. Using a refcount on CDC messages brings another problem: when the link is going to be destroyed, smcr_link_clear() will reset the QP, which then removes all the pending CQEs related to the QP in the CQ. To make sure all the CQEs will always come back so the refcount on the smc_connection can always reach 0, smc_ib_modify_qp_reset() was replaced by smc_ib_modify_qp_error(). Also remove the timeout in smc_wr_tx_wait_no_pending_sends(), since we need to wait for all pending WQEs to be done, or we may encounter a use-after-free when handling CQEs. For the IB device removal routine, we need to wait for all the QPs on that device to be destroyed before we can destroy the CQs on the device, or the refcount on smc_connection won't reach 0 and smc_sock cannot be released. Fixes: 5f08318f617b ("smc: connection data control (CDC)") Reported-by: Wen Gu Signed-off-by: Dust Li Signed-off-by: David S.
Miller --- net/smc/smc.h | 5 +++++ net/smc/smc_cdc.c | 52 +++++++++++++++++++++------------------------- net/smc/smc_cdc.h | 2 +- net/smc/smc_core.c | 25 +++++++++++++++++----- net/smc/smc_ib.c | 4 ++-- net/smc/smc_ib.h | 1 + net/smc/smc_wr.c | 41 +++--------------------------------- net/smc/smc_wr.h | 3 +-- 8 files changed, 57 insertions(+), 76 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index f4286ca1f228..1a4fc1c6c4ab 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -180,6 +180,11 @@ struct smc_connection { u16 tx_cdc_seq; /* sequence # for CDC send */ u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ + atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe + * - inc when post wqe, + * - dec on polled tx cqe + */ + wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 99acd337ba90..84c8a4374fdd 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -31,10 +31,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, struct smc_sock *smc; int diff; - if (!conn) - /* already dismissed */ - return; - smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { @@ -51,6 +47,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn); conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } + + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && + unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); + WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); + smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); } @@ -107,6 +109,10 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + + atomic_inc(&conn->cdc_pend_tx_wr); + 
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); @@ -114,6 +120,7 @@ int smc_cdc_msg_send(struct smc_connection *conn, } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + atomic_dec(&conn->cdc_pend_tx_wr); } return rc; @@ -136,7 +143,18 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn, peer->token = htonl(local->token); peer->prod_flags.failover_validation = 1; + /* We need to set pend->conn here to make sure smc_cdc_tx_handler() + * can handle properly + */ + smc_cdc_add_pending_send(conn, pend); + + atomic_inc(&conn->cdc_pend_tx_wr); + smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + if (unlikely(rc)) + atomic_dec(&conn->cdc_pend_tx_wr); + return rc; } @@ -193,31 +211,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) return rc; } -static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, - unsigned long data) +void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn) { - struct smc_connection *conn = (struct smc_connection *)data; - struct smc_cdc_tx_pend *cdc_pend = - (struct smc_cdc_tx_pend *)tx_pend; - - return cdc_pend->conn == conn; -} - -static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) -{ - struct smc_cdc_tx_pend *cdc_pend = - (struct smc_cdc_tx_pend *)tx_pend; - - cdc_pend->conn = NULL; -} - -void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) -{ - struct smc_link *link = conn->lnk; - - smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, - smc_cdc_tx_filter, smc_cdc_tx_dismisser, - (unsigned long)conn); + wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr)); } /* Send a SMC-D CDC header. 
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 0a0a89abd38b..696cc11f2303 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -291,7 +291,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend); -void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); +void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 343afbdafb98..3c73a665d110 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1152,7 +1152,7 @@ void smc_conn_free(struct smc_connection *conn) smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { - smc_cdc_tx_dismiss_slots(conn); + smc_cdc_wait_pend_tx_wr(conn); if (current_work() != &conn->abort_work) cancel_work_sync(&conn->abort_work); } @@ -1229,7 +1229,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); - smc_ib_modify_qp_reset(lnk); + smc_ib_modify_qp_error(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); @@ -1361,7 +1361,7 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) else tasklet_unlock_wait(&conn->rx_tsklet); } else { - smc_cdc_tx_dismiss_slots(conn); + smc_cdc_wait_pend_tx_wr(conn); } smc_lgr_unregister_conn(conn); smc_close_active_abort(smc); @@ -1484,11 +1484,16 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd) /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. 
+ * + * We must wait here for QPs been destroyed before we destroy the CQs, + * or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus + * smc_sock cannot be released. */ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + LIST_HEAD(lgr_linkdown_list); int i; spin_lock_bh(&smc_lgr_list.lock); @@ -1500,7 +1505,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) - smcr_link_down_cond_sched(&lgr->lnk[i]); + list_move_tail(&lgr->list, &lgr_linkdown_list); } } } @@ -1512,6 +1517,16 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) __smc_lgr_terminate(lgr, false); } + list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) { + mutex_lock(&lgr->llc_conf_mutex); + smcr_link_down_cond(&lgr->lnk[i]); + mutex_unlock(&lgr->llc_conf_mutex); + } + } + } + if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, @@ -1611,7 +1626,6 @@ static void smcr_link_down(struct smc_link *lnk) if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) return; - smc_ib_modify_qp_reset(lnk); to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ smcr_link_clear(lnk, true); @@ -1849,6 +1863,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + init_waitqueue_head(&conn->cdc_pend_tx_wq); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index b414bb12e6ca..519884ecebab 100644 --- a/net/smc/smc_ib.c +++ 
b/net/smc/smc_ib.c @@ -109,12 +109,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk) IB_QP_MAX_QP_RD_ATOMIC); } -int smc_ib_modify_qp_reset(struct smc_link *lnk) +int smc_ib_modify_qp_error(struct smc_link *lnk) { struct ib_qp_attr qp_attr; memset(&qp_attr, 0, sizeof(qp_attr)); - qp_attr.qp_state = IB_QPS_RESET; + qp_attr.qp_state = IB_QPS_ERR; return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 07585937370e..bfa1c6bf6313 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -90,6 +90,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); +int smc_ib_modify_qp_error(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index df1dc225cbab..c6cfdea8b71b 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -62,13 +62,9 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link) } /* wait till all pending tx work requests on the given link are completed */ -int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +void smc_wr_tx_wait_no_pending_sends(struct smc_link *link) { - if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), - SMC_WR_TX_WAIT_PENDING_TIME)) - return 0; - else /* timeout */ - return -EPIPE; + wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link)); } static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) @@ -87,7 +83,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) struct smc_wr_tx_pend pnd_snd; struct smc_link *link; u32 pnd_snd_idx; - int i; link = wc->qp->qp_context; @@ -128,14 +123,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (wc->status) { - for_each_set_bit(i, 
link->wr_tx_mask, link->wr_tx_cnt) { - /* clear full struct smc_wr_tx_pend including .priv */ - memset(&link->wr_tx_pends[i], 0, - sizeof(link->wr_tx_pends[i])); - memset(&link->wr_tx_bufs[i], 0, - sizeof(link->wr_tx_bufs[i])); - clear_bit(i, link->wr_tx_mask); - } if (link->lgr->smc_version == SMC_V2) { memset(link->wr_tx_v2_pend, 0, sizeof(*link->wr_tx_v2_pend)); @@ -421,25 +408,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) return rc; } -void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type, - smc_wr_tx_filter filter, - smc_wr_tx_dismisser dismisser, - unsigned long data) -{ - struct smc_wr_tx_pend_priv *tx_pend; - struct smc_wr_rx_hdr *wr_tx; - int i; - - for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { - wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i]; - if (wr_tx->type != wr_tx_hdr_type) - continue; - tx_pend = &link->wr_tx_pends[i].priv; - if (filter(tx_pend, data)) - dismisser(tx_pend); - } -} - /****************************** receive queue ********************************/ int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) @@ -675,10 +643,7 @@ void smc_wr_free_link(struct smc_link *lnk) smc_wr_wakeup_reg_wait(lnk); smc_wr_wakeup_tx_wait(lnk); - if (smc_wr_tx_wait_no_pending_sends(lnk)) - memset(lnk->wr_tx_mask, 0, - BITS_TO_LONGS(SMC_WR_BUF_CNT) * - sizeof(*lnk->wr_tx_mask)); + smc_wr_tx_wait_no_pending_sends(lnk); wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 48ed9b08ac7a..47512ccce5ef 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -22,7 +22,6 @@ #define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) -#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ) #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ @@ -130,7 +129,7 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, 
u8 wr_rx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, unsigned long data); -int smc_wr_tx_wait_no_pending_sends(struct smc_link *link); +void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); -- Gitee From 7517520bc475d8084027e103c53549e5aaeaf8e9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 30 Dec 2021 11:40:40 +0100 Subject: [PATCH 095/148] net/smc: Use the bitmap API when applicable Using the bitmap API is less verbose than hand writing them. It also improves the semantic. Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- net/smc/smc_wr.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index c6cfdea8b71b..24be1d03fef9 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -54,11 +54,7 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ /* returns true if at least one tx work request is pending on the given link */ static inline bool smc_wr_is_tx_pend(struct smc_link *link) { - if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) != - link->wr_tx_cnt) { - return true; - } - return false; + return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt); } /* wait till all pending tx work requests on the given link are completed */ @@ -696,7 +692,7 @@ void smc_wr_free_link_mem(struct smc_link *lnk) lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); lnk->wr_tx_pends = NULL; - kfree(lnk->wr_tx_mask); + bitmap_free(lnk->wr_tx_mask); lnk->wr_tx_mask = NULL; kfree(lnk->wr_tx_sges); lnk->wr_tx_sges = NULL; @@ -772,9 +768,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; - link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT), - sizeof(*link->wr_tx_mask), - GFP_KERNEL); + link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL); if 
(!link->wr_tx_mask) goto no_mem_wr_rx_sges; link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, @@ -887,8 +881,7 @@ int smc_wr_create_link(struct smc_link *lnk) goto dma_unmap; } smc_wr_init_sge(lnk); - memset(lnk->wr_tx_mask, 0, - BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); init_waitqueue_head(&lnk->wr_tx_wait); atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); -- Gitee From 0434f5e13460cd4b7569bafd7e573499d3e0634b Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 28 Dec 2021 21:06:09 +0800 Subject: [PATCH 096/148] net/smc: Introduce net namespace support for linkgroup Currently, rdma device supports exclusive net namespace isolation, however linkgroup doesn't know and support ibdev net namespace. Applications in the containers don't want to share the nics if we enabled rdma exclusive mode. Every net namespace should have its own linkgroups. This patch introduces a new field net for linkgroup, which stands for the ibdev net namespace in the linkgroup. The net in linkgroup is initialized with the net namespace of link's ibdev. It compares the net of linkgroup and sock or ibdev before choosing it; if none matches, a new one is created in the current net namespace. If rdma net namespace exclusive mode is not enabled, it behaves as before. Signed-off-by: Tony Lu Signed-off-by: David S.
Miller --- net/smc/smc_core.c | 24 +++++++++++++++++------- net/smc/smc_core.h | 2 ++ net/smc/smc_ib.h | 7 +++++++ net/smc/smc_pnet.c | 21 ++++++++++++++++----- 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3c73a665d110..7c543e49ae11 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -916,6 +916,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_wr_free_lgr_mem(lgr); goto free_wq; } + lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); @@ -1604,7 +1605,8 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN) || lgr->type == SMC_LGR_SYMMETRIC || - lgr->type == SMC_LGR_ASYMMETRIC_PEER) + lgr->type == SMC_LGR_ASYMMETRIC_PEER || + !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; /* trigger local add link processing */ @@ -1762,8 +1764,10 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, u8 peer_systemid[], u8 peer_gid[], u8 peer_mac_v1[], - enum smc_lgr_role role, u32 clcqpn) + enum smc_lgr_role role, u32 clcqpn, + struct net *net) { + struct smc_link *lnk; int i; if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || @@ -1771,12 +1775,17 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, return false; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_active(&lgr->lnk[i])) + lnk = &lgr->lnk[i]; + + if (!smc_link_active(lnk)) continue; - if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && - !memcmp(lgr->lnk[i].peer_gid, peer_gid, SMC_GID_SIZE) && + /* use verbs API to check netns, instead of lgr->net */ + if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net)) + return false; + if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) && + !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) && (smcr_version == SMC_V2 || - 
!memcmp(lgr->lnk[i].peer_mac, peer_mac_v1, ETH_ALEN))) + !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN))) return true; } return false; @@ -1792,6 +1801,7 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct net *net = sock_net(&smc->sk); struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; @@ -1818,7 +1828,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) smcr_lgr_match(lgr, ini->smcr_version, ini->peer_systemid, ini->peer_gid, ini->peer_mac, role, - ini->ib_clcqpn)) && + ini->ib_clcqpn, net)) && !lgr->sync_err && (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f2a12d79793f..5e8e37ec863b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -307,6 +307,8 @@ struct smc_link_group { u8 nexthop_mac[ETH_ALEN]; u8 uses_gateway; __be32 saddr; + /* net namespace */ + struct net *net; }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index bfa1c6bf6313..5d8b49c57f50 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -69,6 +69,13 @@ static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE]) return cpu_to_be32(INADDR_NONE); } +static inline struct net *smc_ib_net(struct smc_ib_device *smcibdev) +{ + if (smcibdev && smcibdev->ibdev) + return read_pnet(&smcibdev->ibdev->coredev.rdma_net); + return NULL; +} + struct smc_init_info_smcrv2; struct smc_buf_desc; struct smc_link; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index fb1952478ac8..13df00306182 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -977,14 +977,16 @@ static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, - struct smc_ib_device *known_dev) + struct 
smc_ib_device *known_dev, + struct net *net) { struct smc_ib_device *ibdev; int i; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - if (ibdev == known_dev) + if (ibdev == known_dev || + !rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) @@ -1001,12 +1003,14 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, mutex_unlock(&smc_ib_devices.mutex); } -/* find alternate roce device with same pnet_id and vlan_id */ +/* find alternate roce device with same pnet_id, vlan_id and net namespace */ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, struct smc_init_info *ini, struct smc_ib_device *known_dev) { - _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev); + struct net *net = lgr->net; + + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); } /* if handshake network device belongs to a roce device, return its @@ -1015,6 +1019,7 @@ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { + struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); @@ -1022,6 +1027,10 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct net_device *ndev; int i; + /* check rdma net namespace */ + if (!rdma_dev_access_netns(ibdev->ibdev, net)) + continue; + for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; @@ -1052,15 +1061,17 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net *net; ndev = pnet_find_base_ndev(ndev); + net = dev_net(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - 
_smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, -- Gitee From a08df6eb2b2efa8f4c7c2350698ad61c2649078c Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 28 Dec 2021 21:44:36 +0800 Subject: [PATCH 097/148] net/smc: Introduce TCP ULP support This implements TCP ULP for SMC, helps applications to replace TCP with SMC protocol in place. And we use it to implement transparent replacement. This replaces original TCP sockets with SMC, reuse TCP as clcsock when calling setsockopt with TCP_ULP option, and without any overhead. To replace TCP sockets with SMC, there are two approaches: - use setsockopt() syscall with TCP_ULP option, if error, it would fallback to TCP. - use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to replace transparently. BPF hooks some points in create socket, bind and others, users can inject their BPF logics without modifying their applications, and choose which connections should be replaced with SMC by calling setsockopt() in BPF prog, based on rules, such as TCP tuples, PID, cgroup, etc... BPF doesn't support calling setsockopt with TCP_ULP now, I will send the patches after this accepted. Signed-off-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 93 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 86 insertions(+), 7 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c1583df1cd3f..4ef0f71c7548 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2700,8 +2700,8 @@ static const struct proto_ops smc_sock_ops = { .splice_read = smc_splice_read, }; -static int smc_create(struct net *net, struct socket *sock, int protocol, - int kern) +static int __smc_create(struct net *net, struct socket *sock, int protocol, + int kern, struct socket *clcsock) { int family = (protocol == SMCPROTO_SMC6) ? 
PF_INET6 : PF_INET; struct smc_sock *smc; @@ -2726,12 +2726,19 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, smc = smc_sk(sk); smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; - rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, - &smc->clcsock); - if (rc) { - sk_common_release(sk); - goto out; + + rc = 0; + if (!clcsock) { + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); + if (rc) { + sk_common_release(sk); + goto out; + } + } else { + smc->clcsock = clcsock; } + smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); @@ -2739,12 +2746,76 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, return rc; } +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + return __smc_create(net, sock, protocol, kern, NULL); +} + static const struct net_proto_family smc_sock_family_ops = { .family = PF_SMC, .owner = THIS_MODULE, .create = smc_create, }; +static int smc_ulp_init(struct sock *sk) +{ + struct socket *tcp = sk->sk_socket; + struct net *net = sock_net(sk); + struct socket *smcsock; + int protocol, ret; + + /* only TCP can be replaced */ + if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || + (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) + return -ESOCKTNOSUPPORT; + /* don't handle wq now */ + if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) + return -ENOTCONN; + + if (sk->sk_family == AF_INET) + protocol = SMCPROTO_SMC; + else + protocol = SMCPROTO_SMC6; + + smcsock = sock_alloc(); + if (!smcsock) + return -ENFILE; + + smcsock->type = SOCK_STREAM; + __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ + ret = __smc_create(net, smcsock, protocol, 1, tcp); + if (ret) { + sock_release(smcsock); /* module_put() which ops won't be NULL */ + return ret; + } + + 
/* replace tcp socket to smc */ + smcsock->file = tcp->file; + smcsock->file->private_data = smcsock; + smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ + smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ + tcp->file = NULL; + + return ret; +} + +static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, + const gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(newsk); + + /* don't inherit ulp ops to child when listen */ + icsk->icsk_ulp_ops = NULL; +} + +static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { + .name = "smc", + .owner = THIS_MODULE, + .init = smc_ulp_init, + .clone = smc_ulp_clone, +}; + unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) @@ -2855,6 +2926,12 @@ static int __init smc_init(void) goto out_sock; } + rc = tcp_register_ulp(&smc_ulp_ops); + if (rc) { + pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); + goto out_sock; + } + static_branch_enable(&tcp_have_smc); return 0; @@ -2883,6 +2960,7 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); @@ -2905,3 +2983,4 @@ MODULE_AUTHOR("Ursula Braun "); MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); +MODULE_ALIAS_TCP_ULP("smc"); -- Gitee From b6a7c8479a66f3fe547e94d1c50c67c41aa17b23 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 30 Dec 2021 15:39:00 +0000 Subject: [PATCH 098/148] net/smc: remove redundant re-assignment of pointer link The pointer link is being re-assigned the same value that it was initialized with in the previous declaration statement. The re-assignment is redundant and can be removed. 
Fixes: 387707fdf486 ("net/smc: convert static link ID to dynamic references") Signed-off-by: Colin Ian King Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 8409ab71a5e4..6be95a2a7b25 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1021,7 +1021,6 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, struct smc_link *link = conn->lnk; /* SMC-R specific settings */ - link = conn->lnk; memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); clc->hdr.typev1 = SMC_TYPE_R; -- Gitee From 14038da46939bfaf52a8c9adedb4da10824a112f Mon Sep 17 00:00:00 2001 From: Dust Li Date: Fri, 31 Dec 2021 14:08:53 +0800 Subject: [PATCH 099/148] net/smc: add comments for smc_link_{usable|sendable} Add comments for both smc_link_sendable() and smc_link_usable() to help better distinguish and use them. No function changes. Signed-off-by: Dust Li Signed-off-by: David S. Miller --- net/smc/smc_core.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5e8e37ec863b..ebca96fe3a2b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -410,7 +410,13 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } -/* returns true if the specified link is usable */ +/* + * Returns true if the specified link is usable. + * + * usable means the link is ready to receive RDMA messages, map memory + * on the link, etc. This doesn't ensure we are able to send RDMA messages + * on this link, if sending RDMA messages is needed, use smc_link_sendable() + */ static inline bool smc_link_usable(struct smc_link *lnk) { if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) @@ -418,6 +424,15 @@ static inline bool smc_link_usable(struct smc_link *lnk) return true; } +/* + * Returns true if the specified link is ready to receive AND send RDMA + * messages. 
+ * + * For the client side in first contact, the underlying QP may still in + * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable() + * is not strong enough. For those places that need to send any CDC or LLC + * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead + */ static inline bool smc_link_sendable(struct smc_link *lnk) { return smc_link_usable(lnk) && -- Gitee From a559c8a8bb3a334b178a54d92d208f70af8de01b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 6 Jan 2022 20:42:08 +0800 Subject: [PATCH 100/148] net/smc: Reset conn->lgr when link group registration fails SMC connections might fail to be registered in a link group due to unable to find a usable link during its creation. As a result, smc_conn_create() will return a failure and most resources related to the connection won't be applied or initialized, such as conn->abort_work or conn->lnk. If smc_conn_free() is invoked later, it will try to access the uninitialized resources related to the connection, thus causing a warning or crash. This patch tries to fix this by resetting conn->lgr to NULL if an abnormal exit occurs in smc_lgr_register_conn(), thus avoiding the access to uninitialized resources in smc_conn_free(). Meanwhile, the new created link group should be terminated if smc connections can't be registered in it. So smc_lgr_cleanup_early() is modified to take care of link group only and invoked to terminate unusable link group by smc_conn_create(). The call to smc_conn_free() is moved out from smc_lgr_cleanup_early() to smc_conn_abort(). Fixes: 56bc3b2094b4 ("net/smc: assign link to a new connection") Suggested-by: Karsten Graul Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 8 +++++--- net/smc/smc_core.c | 12 +++++++----- net/smc/smc_core.h | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4ef0f71c7548..6f4c39aec069 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -632,10 +632,12 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, static void smc_conn_abort(struct smc_sock *smc, int local_first) { + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + + smc_conn_free(conn); if (local_first) - smc_lgr_cleanup_early(&smc->conn); - else - smc_conn_free(&smc->conn); + smc_lgr_cleanup_early(lgr); } /* check if there is a rdma device available for this connection. */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7c543e49ae11..b36b0c2508b0 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -171,8 +171,10 @@ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) if (!conn->lgr->is_smcd) { rc = smcr_lgr_conn_assign_link(conn, first); - if (rc) + if (rc) { + conn->lgr = NULL; return rc; + } } /* find a new alert_token_local value not yet used by some connection * in this link group @@ -622,15 +624,13 @@ int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void smc_lgr_cleanup_early(struct smc_connection *conn) +void smc_lgr_cleanup_early(struct smc_link_group *lgr) { - struct smc_link_group *lgr = conn->lgr; spinlock_t *lgr_lock; if (!lgr) return; - smc_conn_free(conn); smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ @@ -1867,8 +1867,10 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) write_lock_bh(&lgr->conns_lock); rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); - if (rc) + if (rc) { + smc_lgr_cleanup_early(lgr); goto out; + } } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; 
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index ebca96fe3a2b..e248a2d3672b 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -486,7 +486,7 @@ static inline void smc_set_pci_values(struct pci_dev *pci_dev, struct smc_sock; struct smc_clc_msg_accept_confirm; -void smc_lgr_cleanup_early(struct smc_connection *conn); +void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); -- Gitee From 80dbe57cc57c93f694ab3b08d707a8c750c9557e Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 13 Jan 2022 16:36:40 +0800 Subject: [PATCH 101/148] net/smc: Resolve the race between link group access and termination We encountered some crashes caused by the race between the access and the termination of link groups. Here are some of panic stacks we met: 1) Race between smc_clc_wait_msg() and __smc_lgr_terminate() BUG: kernel NULL pointer dereference, address: 00000000000002f0 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_clc_wait_msg+0x3eb/0x5c0 [smc] Call Trace: ? smc_clc_send_accept+0x45/0xa0 [smc] ? smc_clc_send_accept+0x45/0xa0 [smc] smc_listen_work+0x783/0x1220 [smc] ? finish_task_switch+0xc4/0x2e0 ? process_one_work+0x1ad/0x3c0 process_one_work+0x1ad/0x3c0 worker_thread+0x4c/0x390 ? rescuer_thread+0x320/0x320 kthread+0x149/0x190 ? 
set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 smc_listen_work() abnormal case like port error --------------------------------------------------------------- | __smc_lgr_terminate() | |- smc_conn_kill() | |- smc_lgr_unregister_conn() | |- set conn->lgr = NULL smc_clc_wait_msg() | |- access conn->lgr (panic) | 2) Race between smc_setsockopt() and __smc_lgr_terminate() BUG: kernel NULL pointer dereference, address: 00000000000002e8 RIP: 0010:smc_setsockopt+0x17a/0x280 [smc] Call Trace: __sys_setsockopt+0xfc/0x190 __x64_sys_setsockopt+0x20/0x30 do_syscall_64+0x34/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae smc_setsockopt() abnormal case like port error -------------------------------------------------------------- | __smc_lgr_terminate() | |- smc_conn_kill() | |- smc_lgr_unregister_conn() | |- set conn->lgr = NULL mod_delayed_work() | |- access conn->lgr (panic) | There are some other panic places and they are caused by the similar reason as described above, which is accessing link group after termination, thus getting a NULL pointer or invalid resource. Currently, there seems to be no synchronization between the link group access and a sudden termination of it. This patch tries to fix this by introducing reference count of link group and not freeing link group until reference count is zero. Link group might be referred to by links or smc connections. So the operation to the link group reference count can be summarized as follows: object [hold or initialized as 1] [put] ------------------------------------------------------------------- link group smc_lgr_create() smc_lgr_free() connections smc_conn_create() smc_conn_free() links smcr_link_init() smcr_link_clear() In this way, we extend the life cycle of link group and ensure it is longer than the life cycle of connections and links above it, thus avoiding invalid access to link group after its termination. Signed-off-by: Wen Gu Signed-off-by: David S.
Miller --- net/smc/smc.h | 1 + net/smc/smc_core.c | 60 +++++++++++++++++++++++++++++++++++++--------- net/smc/smc_core.h | 3 +++ 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 1a4fc1c6c4ab..3d0b8e300deb 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -221,6 +221,7 @@ struct smc_connection { */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ + u8 freed : 1; /* normal termiation */ u8 out_of_sync : 1; /* out of sync with peer */ }; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index b36b0c2508b0..ea63087280f4 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -218,7 +218,6 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - conn->lgr = NULL; } int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) @@ -766,6 +765,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; + smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); @@ -820,6 +820,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold above */ return rc; } @@ -858,6 +859,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->terminating = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; + refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ mutex_init(&lgr->sndbufs_lock); mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -1146,8 +1148,19 @@ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if (!lgr) + if (!lgr || conn->freed) + /* 
Connection has never been registered in a + * link group, or has already been freed. + */ return; + + conn->freed = 1; + if (!conn->alert_token_local) + /* Connection has already unregistered from + * link group. + */ + goto lgr_put; + if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); @@ -1164,6 +1177,8 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); +lgr_put: + smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } /* unregister a link from a buf_desc */ @@ -1222,9 +1237,10 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { + struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; - if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) + if (!lgr || lnk->state == SMC_LNK_UNUSED) return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); @@ -1242,6 +1258,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log) lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -1306,6 +1323,21 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) __smc_lgr_free_bufs(lgr, true); } +/* won't be freed until no one accesses to lgr anymore */ +static void __smc_lgr_free(struct smc_link_group *lgr) +{ + smc_lgr_free_bufs(lgr); + if (lgr->is_smcd) { + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); + } else { + smc_wr_free_lgr_mem(lgr); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); + } + kfree(lgr); +} + /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { @@ -1321,19 +1353,23 @@ static void smc_lgr_free(struct smc_link_group *lgr) smc_llc_lgr_clear(lgr); } - smc_lgr_free_bufs(lgr); destroy_workqueue(lgr->tx_wq); if 
(lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); - if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) - wake_up(&lgr->smcd->lgrs_deleted); - } else { - smc_wr_free_lgr_mem(lgr); - if (!atomic_dec_return(&lgr_cnt)) - wake_up(&lgrs_deleted); } - kfree(lgr); + smc_lgr_put(lgr); /* theoretically last lgr_put */ +} + +void smc_lgr_hold(struct smc_link_group *lgr) +{ + refcount_inc(&lgr->refcnt); +} + +void smc_lgr_put(struct smc_link_group *lgr) +{ + if (refcount_dec_and_test(&lgr->refcnt)) + __smc_lgr_free(lgr); } static void smc_sk_wake_ups(struct smc_sock *smc) @@ -1872,6 +1908,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) goto out; } } + smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ + conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e248a2d3672b..fb5661d7a771 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -250,6 +250,7 @@ struct smc_link_group { u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ + refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; @@ -488,6 +489,8 @@ struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); +void smc_lgr_hold(struct smc_link_group *lgr); +void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, -- Gitee From 15b90e4e1926c96b520a54381e5e3f51b69616e8 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 13 Jan 2022 16:36:41 +0800 Subject: [PATCH 102/148] net/smc: Introduce a new conn->lgr validity check helper It is no longer 
suitable to identify whether a smc connection is registered in a link group through checking if conn->lgr is NULL, because conn->lgr won't be reset even the connection is unregistered from a link group. So this patch introduces a new helper smc_conn_lgr_valid() and replaces all the check of conn->lgr in original implementation with the new helper to judge if conn->lgr is valid to use. Signed-off-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 +++++- net/smc/smc_cdc.c | 3 ++- net/smc/smc_clc.c | 2 +- net/smc/smc_core.c | 14 ++++++++------ net/smc/smc_core.h | 5 +++++ net/smc/smc_diag.c | 6 +++--- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6f4c39aec069..38a8af325f4a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -634,9 +634,13 @@ static void smc_conn_abort(struct smc_sock *smc, int local_first) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; + bool lgr_valid = false; + + if (smc_conn_lgr_valid(conn)) + lgr_valid = true; smc_conn_free(conn); - if (local_first) + if (local_first && lgr_valid) smc_lgr_cleanup_early(lgr); } diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 84c8a4374fdd..9d5a97168969 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -197,7 +197,8 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; - if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + if (!smc_conn_lgr_valid(conn) || + (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) return -EPIPE; if (conn->lgr->is_smcd) { diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 6be95a2a7b25..ce27399b38b1 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -774,7 +774,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 
SMC_FIRST_CONTACT_MASK : 0; - if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) && + if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) && smc_ib_is_valid_local_systemid()) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ea63087280f4..771dafaccca6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -211,7 +211,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if (!lgr) + if (!smc_conn_lgr_valid(conn)) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { @@ -1155,7 +1155,7 @@ void smc_conn_free(struct smc_connection *conn) return; conn->freed = 1; - if (!conn->alert_token_local) + if (!smc_conn_lgr_valid(conn)) /* Connection has already unregistered from * link group. */ @@ -2294,14 +2294,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || + !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || + !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -2310,7 +2312,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; - if (!conn->lgr || conn->lgr->is_smcd) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) @@ -2324,7 +2326,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { int i; - if (!conn->lgr || conn->lgr->is_smcd) + if 
(!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fb5661d7a771..8b4ed82785b9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -411,6 +411,11 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +static inline bool smc_conn_lgr_valid(struct smc_connection *conn) +{ + return conn->lgr && conn->alert_token_local; +} + /* * Returns true if the specified link is usable. * diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index c952986a6aca..25ef26b621a2 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -89,7 +89,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, r->diag_state = sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; - else if (smc->conn.lgr && smc->conn.lgr->is_smcd) + else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; @@ -142,7 +142,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if (smc->conn.lgr && !smc->conn.lgr->is_smcd && + if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { @@ -162,7 +162,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } - if (smc->conn.lgr && smc->conn.lgr->is_smcd && + if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; -- Gitee From 53253ea049b39ebc1409239a21495130e2ca4546 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 13 Jan 2022 16:36:42 +0800 Subject: [PATCH 103/148] net/smc: Resolve the 
race between SMC-R link access and clear We encountered some crashes caused by the race between SMC-R link access and link clear that is triggered by abnormal link group termination, such as port error. Here is an example of this kind of crash: BUG: kernel NULL pointer dereference, address: 0000000000000000 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_llc_flow_initiate+0x44/0x190 [smc] Call Trace: ? __smc_buf_create+0x75a/0x950 [smc] smcr_lgr_reg_rmbs+0x2a/0xbf [smc] smc_listen_work+0xf72/0x1230 [smc] ? process_one_work+0x25c/0x600 process_one_work+0x25c/0x600 worker_thread+0x4f/0x3a0 ? process_one_work+0x600/0x600 kthread+0x15d/0x1a0 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 smc_listen_work() __smc_lgr_terminate() --------------------------------------------------------------- | smc_lgr_free() | |- smcr_link_clear() | |- memset(lnk, 0) smc_listen_rdma_reg() | |- smcr_lgr_reg_rmbs() | |- smc_llc_flow_initiate() | |- access lnk->lgr (panic) | These crashes are similarly caused by clearing SMC-R link resources while some functions are still accessing them. This patch tries to fix the issue by introducing a reference count for SMC-R links and ensuring that the sensitive resources of links won't be cleared until the reference count reaches zero. The operations on the SMC-R link reference count can be summarized as follows: object [hold or initialized as 1] [put] -------------------------------------------------------------------- links smcr_link_init() smcr_link_clear() connections smc_conn_create() smc_conn_free() In this way, SMC-R links are cleared only after all the smc connections on top of them have been freed, thus avoiding unsafe references to SMC-R links. Signed-off-by: Wen Gu Signed-off-by: David S.
Miller --- net/smc/smc_core.c | 52 +++++++++++++++++++++++++++++++++++----------- net/smc/smc_core.h | 4 ++++ 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 771dafaccca6..381752302df2 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -762,6 +762,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); + refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ + lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; @@ -1014,8 +1016,12 @@ void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); + /* link_hold in smc_conn_create() */ + smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); + /* link_put in smc_conn_free() */ + smcr_link_hold(conn->lnk); } struct smc_link *smc_switch_conns(struct smc_link_group *lgr, @@ -1178,6 +1184,8 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); lgr_put: + if (!lgr->is_smcd) + smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } @@ -1234,22 +1242,11 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } } -/* must be called under lgr->llc_conf_mutex lock */ -void smcr_link_clear(struct smc_link *lnk, bool log) +static void __smcr_link_clear(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; - if (!lgr || lnk->state == SMC_LNK_UNUSED) - return; - lnk->peer_qpn = 0; - smc_llc_link_clear(lnk, log); - smcr_buf_unmap_lgr(lnk); - smcr_rtoken_clear_link(lnk); - smc_ib_modify_qp_error(lnk); - smc_wr_free_link(lnk); - smc_ib_destroy_queue_pair(lnk); - smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); 
smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); @@ -1261,6 +1258,35 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_clear(struct smc_link *lnk, bool log) +{ + if (!lnk->lgr || lnk->clearing || + lnk->state == SMC_LNK_UNUSED) + return; + lnk->clearing = 1; + lnk->peer_qpn = 0; + smc_llc_link_clear(lnk, log); + smcr_buf_unmap_lgr(lnk); + smcr_rtoken_clear_link(lnk); + smc_ib_modify_qp_error(lnk); + smc_wr_free_link(lnk); + smc_ib_destroy_queue_pair(lnk); + smc_ib_dealloc_protection_domain(lnk); + smcr_link_put(lnk); /* theoretically last link_put */ +} + +void smcr_link_hold(struct smc_link *lnk) +{ + refcount_inc(&lnk->refcnt); +} + +void smcr_link_put(struct smc_link *lnk) +{ + if (refcount_dec_and_test(&lnk->refcnt)) + __smcr_link_clear(lnk); +} + static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { @@ -1909,6 +1935,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } } smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ + if (!conn->lgr->is_smcd) + smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8b4ed82785b9..35a85ec08919 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -138,6 +138,8 @@ struct smc_link { u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? 
*/ + u8 clearing : 1; /* link is being cleared */ + refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ @@ -527,6 +529,8 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); +void smcr_link_hold(struct smc_link *lnk); +void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); -- Gitee From a590bd4a7ac013a5aba40d5519af6cd123f97714 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 14 Jan 2022 21:35:45 +0800 Subject: [PATCH 104/148] net/smc: Remove unused function declaration The declaration of smc_wr_tx_dismiss_slots() is unused. So remove it. Fixes: 349d43127dac ("net/smc: fix kernel panic caused by race of smc_sock") Signed-off-by: Wen Gu Reviewed-by: Dust Li Signed-off-by: David S. 
Miller --- net/smc/smc_wr.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 47512ccce5ef..a54e90a1110f 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -125,10 +125,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); -void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, - smc_wr_tx_filter filter, - smc_wr_tx_dismisser dismisser, - unsigned long data); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); -- Gitee From fb1b5da9758872c48860307dae446b632009c017 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Sun, 16 Jan 2022 15:43:42 +0800 Subject: [PATCH 105/148] net/smc: Fix hung_task when removing SMC-R devices A hung_task is observed when removing SMC-R devices. Suppose that a link group has two active links(lnk_A, lnk_B) associated with two different SMC-R devices(dev_A, dev_B). When dev_A is removed, the link group will be removed from smc_lgr_list and added into lgr_linkdown_list. lnk_A will be cleared and smcibdev(A)->lnk_cnt will reach to zero. However, when dev_B is removed then, the link group can't be found in smc_lgr_list and lnk_B won't be cleared, making smcibdev->lnk_cnt never reaches zero, which causes a hung_task. This patch fixes this issue by restoring the implementation of smc_smcr_terminate_all() to what it was before commit 349d43127dac ("net/smc: fix kernel panic caused by race of smc_sock"). The original implementation also satisfies the intention that make sure QP destroy earlier than CQ destroy because we will always wait for smcibdev->lnk_cnt reaches zero, which guarantees QP has been destroyed. Fixes: 349d43127dac ("net/smc: fix kernel panic caused by race of smc_sock") Signed-off-by: Wen Gu Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 381752302df2..aa4f16718fa1 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1547,16 +1547,11 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd) /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. - * - * We must wait here for QPs been destroyed before we destroy the CQs, - * or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus - * smc_sock cannot be released. */ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); - LIST_HEAD(lgr_linkdown_list); int i; spin_lock_bh(&smc_lgr_list.lock); @@ -1568,7 +1563,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) - list_move_tail(&lgr->list, &lgr_linkdown_list); + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } @@ -1580,16 +1575,6 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) __smc_lgr_terminate(lgr, false); } - list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].smcibdev == smcibdev) { - mutex_lock(&lgr->llc_conf_mutex); - smcr_link_down_cond(&lgr->lnk[i]); - mutex_unlock(&lgr->llc_conf_mutex); - } - } - } - if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, -- Gitee From fb53765160b2277db43bb05f1a302eff98272619 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Sat, 22 Jan 2022 17:43:09 +0800 Subject: [PATCH 106/148] net/smc: Transitional solution for clcsock race issue We encountered a crash in smc_setsockopt() and it is caused by 
accessing smc->clcsock after clcsock was released. BUG: kernel NULL pointer dereference, address: 0000000000000020 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 50309 Comm: nginx Kdump: loaded Tainted: G E 5.16.0-rc4+ #53 RIP: 0010:smc_setsockopt+0x59/0x280 [smc] Call Trace: __sys_setsockopt+0xfc/0x190 __x64_sys_setsockopt+0x20/0x30 do_syscall_64+0x34/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f16ba83918e This patch tries to fix it by holding clcsock_release_lock and checking whether clcsock has already been released before access. To prevent a crash for the same reason in smc_getsockopt() or smc_switch_to_fallback(), this patch checks smc->clcsock in them too. The callers of smc_switch_to_fallback() determine whether the fallback succeeded from its return value. Fixes: fd57770dd198 ("net/smc: wait for pending work before clcsock release_sock") Link: https://lore.kernel.org/lkml/5dd7ffd1-28e2-24cc-9442-1defec27375e@linux.ibm.com/T/ Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: David S.
Miller --- net/smc/af_smc.c | 63 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 38a8af325f4a..3ec44f7cdb96 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -566,12 +566,17 @@ static void smc_stat_fallback(struct smc_sock *smc) mutex_unlock(&net->smc.mutex_fback_rsn); } -static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) +static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); + wait_queue_head_t *clc_wait; unsigned long flags; + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -586,18 +591,30 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * smc socket->wq, which should be removed * to clcsocket->wq during the fallback. 
*/ + clc_wait = sk_sleep(smc->clcsock->sk); spin_lock_irqsave(&smc_wait->lock, flags); spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); list_splice_init(&smc_wait->head, &clc_wait->head); spin_unlock(&clc_wait->lock); spin_unlock_irqrestore(&smc_wait->lock, flags); } + mutex_unlock(&smc->clcsock_release_lock); + return 0; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc, reason_code); + struct net *net = sock_net(&smc->sk); + int rc = 0; + + rc = smc_switch_to_fallback(smc, reason_code); + if (rc) { /* fallback fails */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return rc; + } smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) @@ -1518,11 +1535,12 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, { /* RDMA setup failed, switch back to TCP */ smc_conn_abort(new_smc, local_first); - if (reason_code < 0) { /* error, no fallback possible */ + if (reason_code < 0 || + smc_switch_to_fallback(new_smc, reason_code)) { + /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc, reason_code); if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); @@ -1964,8 +1982,11 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); - smc_listen_out_connected(new_smc); + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); return; } @@ -2254,7 +2275,9 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & 
MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + if (rc) + goto out; } else { rc = -EINVAL; goto out; @@ -2447,6 +2470,11 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else @@ -2456,6 +2484,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } + mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -2472,7 +2501,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } @@ -2515,13 +2544,23 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; + int rc; smc = smc_sk(sock->sk); + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } /* socket options apply to the CLC socket */ - if (unlikely(!smc->clcsock->ops->getsockopt)) + if (unlikely(!smc->clcsock->ops->getsockopt)) { + mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; - return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, - optval, optlen); + } + rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + optval, optlen); + mutex_unlock(&smc->clcsock_release_lock); + return rc; } static int 
smc_ioctl(struct socket *sock, unsigned int cmd, -- Gitee From fc462b3fc46a2c19b13f8b62e48e334f0109bc9f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 26 Jan 2022 23:33:04 +0800 Subject: [PATCH 107/148] net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. 
The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 133 +++++++++++++++++++++++++++++++++++++++++------ net/smc/smc.h | 20 ++++++- 2 files changed, 137 insertions(+), 16 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3ec44f7cdb96..0949788640fe 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -566,17 +566,115 @@ static void smc_stat_fallback(struct smc_sock *smc) mutex_unlock(&net->smc.mutex_fback_rsn); } +/* must be called under rcu read lock */ +static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) +{ + struct socket_wq *wq; + __poll_t flags; + + wq = rcu_dereference(smc->sk.sk_wq); + if (!skwq_has_sleeper(wq)) + return; + + /* wake up smc sk->sk_wq */ + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } +} + +static 
int smc_fback_mark_woken(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) +{ + struct smc_mark_woken *mark = + container_of(wait, struct smc_mark_woken, wait_entry); + + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, + void (*clcsock_callback)(struct sock *sk)) +{ + struct smc_mark_woken mark = { .woken = false }; + struct socket_wq *wq; + + init_waitqueue_func_entry(&mark.wait_entry, + smc_fback_mark_woken); + rcu_read_lock(); + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) + goto out; + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsock_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + + if (mark.woken) + smc_fback_wakeup_waitqueue(smc, mark.key); +out: + rcu_read_unlock(); +} + +static void smc_fback_state_change(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); +} + +static void smc_fback_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); +} + +static void smc_fback_write_space(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); +} + +static void smc_fback_error_report(struct sock *clcsk) +{ + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); + + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); +} + static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait; - unsigned long flags; + struct sock *clcsk; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } + clcsk 
= smc->clcsock->sk; + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -587,16 +685,22 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - /* There may be some entries remaining in - * smc socket->wq, which should be removed - * to clcsocket->wq during the fallback. + /* There might be some wait entries remaining + * in smc sk->sk_wq and they should be woken up + * as clcsock's wait queue is woken up. */ - clc_wait = sk_sleep(smc->clcsock->sk); - spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); - list_splice_init(&smc_wait->head, &clc_wait->head); - spin_unlock(&clc_wait->lock); - spin_unlock_irqrestore(&smc_wait->lock, flags); + smc->clcsk_state_change = clcsk->sk_state_change; + smc->clcsk_data_ready = clcsk->sk_data_ready; + smc->clcsk_write_space = clcsk->sk_write_space; + smc->clcsk_error_report = clcsk->sk_error_report; + + clcsk->sk_state_change = smc_fback_state_change; + clcsk->sk_data_ready = smc_fback_data_ready; + clcsk->sk_write_space = smc_fback_write_space; + clcsk->sk_error_report = smc_fback_error_report; + + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } mutex_unlock(&smc->clcsock_release_lock); return 0; @@ -2115,10 +2219,9 @@ static void smc_tcp_listen_work(struct work_struct *work) static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc; + struct smc_sock *lsmc = + smc_clcsock_user_data(listen_clcsock); - lsmc = (struct smc_sock *) - ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); if (!lsmc) return; lsmc->clcsk_data_ready(listen_clcsock); diff --git a/net/smc/smc.h b/net/smc/smc.h index 3d0b8e300deb..37b2001a0255 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -139,6 +139,12 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; +struct smc_mark_woken { + 
bool woken; + void *key; + wait_queue_entry_t wait_entry; +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -228,8 +234,14 @@ struct smc_connection { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); - /* original data_ready fct. **/ + /* original data_ready fct. */ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ @@ -264,6 +276,12 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) +{ + return (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ -- Gitee From 6484b8e4865f9f70aac16ba2e4af1316f2b1e6cd Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Mon, 31 Jan 2022 02:02:55 +0800 Subject: [PATCH 108/148] net/smc: Send directly when TCP_CORK is cleared According to the man page of TCP_CORK [1], if set, don't send out partial frames. All queued partial frames are sent when option is cleared again. When applications call setsockopt to disable TCP_CORK, this call is protected by lock_sock(), and tries to mod_delayed_work() to 0, in order to send pending data right now. However, the delayed work smc_tx_work is also protected by lock_sock(). There introduces lock contention for sending data. 
To fix it, send pending data directly which acts like TCP, without lock_sock() protected in the context of setsockopt (already lock_sock()ed), and cancel unnecessary dealyed work, which is protected by lock. [1] https://linux.die.net/man/7/tcp Signed-off-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- net/smc/smc_tx.c | 25 +++++++++++++++---------- net/smc/smc_tx.h | 1 + 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0949788640fe..10261fa982b8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2626,8 +2626,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_CLOSED) { if (!val) { SMC_STAT_INC(smc, cork_cnt); - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); } } break; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 02d147bde78c..d8e09c8bf88c 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -598,27 +598,32 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -/* Wakeup sndbuf consumers from process context - * since there is more data to transmit - */ -void smc_tx_work(struct work_struct *work) +void smc_tx_pending(struct smc_connection *conn) { - struct smc_connection *conn = container_of(to_delayed_work(work), - struct smc_connection, - tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc; - lock_sock(&smc->sk); if (smc->sk.sk_err) - goto out; + return; rc = smc_tx_sndbuf_nonempty(conn); if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; +} + +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit + */ +void smc_tx_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(to_delayed_work(work), + struct smc_connection, + tx_work); + 
struct smc_sock *smc = container_of(conn, struct smc_sock, conn); -out: + lock_sock(&smc->sk); + smc_tx_pending(conn); release_sock(&smc->sk); } diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 07e6ad76224a..a59f370b8b43 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -27,6 +27,7 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } +void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); -- Gitee From 0f6eb23793523ab0c30e666dfa645a63afcd1863 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Mon, 31 Jan 2022 02:02:56 +0800 Subject: [PATCH 109/148] net/smc: Remove corked dealyed work Based on the manual of TCP_CORK [1] and MSG_MORE [2], these two options have the same effect. Applications can set these options and informs the kernel to pend the data, and send them out only when the socket or syscall does not specify this flag. In other words, there's no need to send data out by a delayed work, which will queue a lot of work. This removes corked delayed work with SMC_TX_CORK_DELAY (250ms), and the applications control how/when to send them out. It improves the performance for sendfile and throughput, and remove unnecessary race of lock_sock(). This also unlocks the limitation of sndbuf, and try to fill it up before sending. [1] https://linux.die.net/man/7/tcp [2] https://man7.org/linux/man-pages/man2/send.2.html Signed-off-by: Tony Lu Signed-off-by: David S. 
Miller --- net/smc/smc_tx.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index d8e09c8bf88c..393f23d5ce9b 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -31,7 +31,6 @@ #include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 -#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -238,15 +237,13 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && - (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_desc->len >> 1))) - /* for a corked socket defer the RDMA writes if there - * is still sufficient sndbuf_space available + (atomic_read(&conn->sndbuf_space))) + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. */ - queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, - SMC_TX_CORK_DELAY); - else - smc_tx_sndbuf_nonempty(conn); + continue; + smc_tx_sndbuf_nonempty(conn); trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From 9be755519a4d9cccda62a5028ea903eace64b24d Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Mon, 31 Jan 2022 02:02:57 +0800 Subject: [PATCH 110/148] net/smc: Cork when sendpage with MSG_SENDPAGE_NOTLAST flag This introduces a new corked flag, MSG_SENDPAGE_NOTLAST, which is involved in syscall sendfile() [1], it indicates this is not the last page. So we can cork the data until the page is not specify this flag. It has the same effect as MSG_MORE, but existed in sendfile() only. This patch adds a option MSG_SENDPAGE_NOTLAST for corking data, try to cork more data before sending when using sendfile(), which acts like TCP's behaviour. Also, this reimplements the default sendpage to inform that it is supported to some extent. 
[1] https://man7.org/linux/man-pages/man2/sendfile.2.html Signed-off-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- net/smc/smc_tx.c | 19 ++++++++++++++++++- net/smc/smc_tx.h | 2 ++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 10261fa982b8..819585d57f72 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2765,8 +2765,10 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); } else { + lock_sock(sk); + rc = smc_tx_sendpage(smc, page, offset, size, flags); + release_sock(sk); SMC_STAT_INC(smc, sendpage_cnt); - rc = sock_no_sendpage(sock, page, offset, size, flags); } out: diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 393f23d5ce9b..75f7d770dec8 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -236,7 +236,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && (atomic_read(&conn->sndbuf_space))) /* for a corked socket defer the RDMA writes if * sndbuf_space is still available. 
The applications @@ -258,6 +259,22 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return rc; } +int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, + size_t size, int flags) +{ + struct msghdr msg = {.msg_flags = flags}; + char *kaddr = kmap(page); + struct kvec iov; + int rc; + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size); + rc = smc_tx_sendmsg(smc, &msg, size); + kunmap(page); + return rc; +} + /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index a59f370b8b43..34b578498b1f 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -31,6 +31,8 @@ void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); +int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, + size_t size, int flags); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); -- Gitee From 7208d820136a4e56cf4b4c376b1b07443e308817 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 9 Feb 2022 22:10:53 +0800 Subject: [PATCH 111/148] net/smc: Avoid overwriting the copies of clcsock callback functions The callback functions of clcsock will be saved and replaced during the fallback. 
But if the fallback happens more than once, then the copies of these callback functions will be overwritten incorrectly, resulting in a loop call issue: clcsk->sk_error_report |- smc_fback_error_report() <------------------------------| |- smc_fback_forward_wakeup() | (loop) |- clcsock_callback() (incorrectly overwritten) | |- smc->clcsk_error_report() ------------------| So this patch fixes the issue by saving these function pointers only once in the fallback and avoiding overwriting. Reported-by: syzbot+4de3c0e8a263e1e499bc@syzkaller.appspotmail.com Fixes: 341adeec9ada ("net/smc: Forward wakeup to smc socket waitqueue after fallback") Link: https://lore.kernel.org/r/0000000000006d045e05d78776f6@google.com Signed-off-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 819585d57f72..de26d0ced3cd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -667,14 +667,17 @@ static void smc_fback_error_report(struct sock *clcsk) static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { struct sock *clcsk; + int rc = 0; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); - return -EBADF; + rc = -EBADF; + goto out; } clcsk = smc->clcsock->sk; + if (smc->use_fallback) + goto out; smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -702,8 +705,9 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } +out: mutex_unlock(&smc->clcsock_release_lock); - return 0; + return rc; } /* fall back during connect */ -- Gitee From 362b6f1e2ecb45b2ee51cc87deae9679c4134a83 Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Thu, 10 Feb 2022 17:11:34 +0800 Subject: [PATCH 112/148] net/smc: Make smc_tcp_listen_work() independent In a multithreaded benchmark with 10K connections, the backend TCP connections are established very slowly, and lots of TCP connections stay in SYN_SENT state. Client: smc_run wrk -c 10000 -t 4 http://server The netstat output of the server host shows: 145042 times the listen queue of a socket overflowed 145042 SYNs to LISTEN sockets dropped One reason for this issue is that smc_tcp_listen_work() shares the same workqueue (smc_hs_wq) with smc_listen_work(), while smc_listen_work() does a blocking wait for the smc connection to be established. Once the workqueue becomes congested, it will block the accept() from TCP listen. This patch creates an independent workqueue (smc_tcp_ls_wq) for smc_tcp_listen_work(), separating it from smc_listen_work(), which is quite acceptable considering that smc_tcp_listen_work() runs very fast. Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index de26d0ced3cd..d2209d01e336 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -59,6 +59,7 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ +static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ @@ -2231,7 +2232,7 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } } @@ -3028,9 +3029,14 @@ static int __init smc_init(void) goto out_nl; rc = -ENOMEM; + + smc_tcp_ls_wq =
alloc_workqueue("smc_tcp_ls_wq", 0, 0); + if (!smc_tcp_ls_wq) + goto out_pnet; + smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) - goto out_pnet; + goto out_alloc_tcp_ls_wq; smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); if (!smc_close_wq) @@ -3101,6 +3107,8 @@ static int __init smc_init(void) destroy_workqueue(smc_close_wq); out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); +out_alloc_tcp_ls_wq: + destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); out_nl: @@ -3119,6 +3127,7 @@ static void __exit smc_exit(void) smc_core_exit(); smc_ib_unregister_client(); destroy_workqueue(smc_close_wq); + destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); -- Gitee From 4ac047c22bb27f8be8ffca03d354e58a5eee7ee8 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:35 +0800 Subject: [PATCH 113/148] net/smc: Limit backlog connections Current implementation does not handling backlog semantics, one potential risk is that server will be flooded by infinite amount connections, even if client was SMC-incapable. This patch works to put a limit on backlog connections, referring to the TCP implementation, we divides SMC connections into two categories: 1. Half SMC connection, which includes all TCP established while SMC not connections. 2. Full SMC connection, which includes all SMC established connections. For half SMC connection, since all half SMC connections starts with TCP established, we can achieve our goal by put a limit before TCP established. Refer to the implementation of TCP, this limits will based on not only the half SMC connections but also the full connections, which is also a constraint on full SMC connections. For full SMC connections, although we know exactly where it starts, it's quite hard to put a limit before it. 
The easiest way is to block wait before receive SMC confirm CLC message, while it's under protection by smc_server_lgr_pending, a global lock, which leads this limit to the entire host instead of a single listen socket. Another way is to drop the full connections, but considering the cast of SMC connections, we prefer to keep full SMC connections. Even so, the limits of full SMC connections still exists, see commits about half SMC connection below. After this patch, the limits of backend connection shows like: For SMC: 1. Client with SMC-capability can makes 2 * backlog full SMC connections or 1 * backlog half SMC connections and 1 * backlog full SMC connections at most. 2. Client without SMC-capability can only makes 1 * backlog half TCP connections and 1 * backlog full TCP connections. Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc.h | 6 +++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d2209d01e336..53cd8bb89dad 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -73,6 +73,36 @@ static void smc_set_keepalive(struct sock *sk, int val) smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } +static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct smc_sock *smc; + + smc = smc_clcsock_user_data(sk); + + if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > + sk->sk_max_ack_backlog) + goto drop; + + if (sk_acceptq_is_full(&smc->sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + /* passthrough to original syn recv sock fct */ + return smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, + own_req); + +drop: + dst_release(dst); + tcp_listendrop(sk); + return NULL; +} + static struct smc_hashinfo 
smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; @@ -1599,6 +1629,9 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_dec(&lsmc->queued_smc_hs); + if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -2204,6 +2237,9 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_inc(&lsmc->queued_smc_hs); + new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -2270,6 +2306,15 @@ static int smc_listen(struct socket *sock, int backlog) smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + /* save original ops */ + smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; + + smc->af_ops = *smc->ori_af_ops; + smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; + + inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; + rc = kernel_listen(smc->clcsock, backlog); if (rc) { smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; diff --git a/net/smc/smc.h b/net/smc/smc.h index 37b2001a0255..e91e40040d07 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -252,6 +252,10 @@ struct smc_sock { /* smc sock container */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ + atomic_t queued_smc_hs; /* queued smc handshakes */ + struct inet_connection_sock_af_ops af_ops; + const struct inet_connection_sock_af_ops *ori_af_ops; + /* original af ops */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value @@ -276,7 +280,7 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } 
-static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) +static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); -- Gitee From 373a3b5239dd029b333d168b7f29ceb456884323 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:36 +0800 Subject: [PATCH 114/148] net/smc: Limit SMC visits when handshake workqueue congested This patch intends to provide a mechanism to put constraint on SMC connections visit according to the pressure of SMC handshake process. At present, frequent visits will cause the incoming connections to be backlogged in SMC handshake queue, raise the connections established time. Which is quite unacceptable for those applications who base on short lived connections. There are two ways to implement this mechanism: 1. Put limitation after TCP established. 2. Put limitation before TCP established. In the first way, we need to wait and receive CLC messages that the client will potentially send, and then actively reply with a decline message, in a sense, which is also a sort of SMC handshake, affect the connections established time on its way. In the second way, the only problem is that we need to inject SMC logic into TCP when it is about to reply the incoming SYN, since we already do that, it's seems not a problem anymore. And advantage is obvious, few additional processes are required to complete the constraint. This patch use the second way. After this patch, connections who beyond constraint will not informed any SMC indication, and SMC will not be involved in any of its subsequent processes. Link: https://lore.kernel.org/all/1641301961-59331-1-git-send-email-alibuda@linux.alibaba.com/ Signed-off-by: D. Wythe Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 3 ++- net/smc/af_smc.c | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2f87377e9af7..66177c5e27c9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -394,6 +394,7 @@ struct tcp_sock { bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) + bool (*smc_hs_congested)(const struct sock *sk); bool syn_smc; /* SYN includes SMC */ #endif diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b71bdda39991..6c7a8d0bf4fe 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6645,7 +6645,8 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) - ireq->smc_ok = rx_opt->smc_ok; + ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && + tcp_sk(sk)->smc_hs_congested(sk)); #endif } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 53cd8bb89dad..805789a1589c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -103,6 +103,21 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, return NULL; } +static bool smc_hs_congested(const struct sock *sk) +{ + const struct smc_sock *smc; + + smc = smc_clcsock_user_data(sk); + + if (!smc) + return true; + + if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) + return true; + + return false; +} + static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; @@ -2315,6 +2330,8 @@ static int smc_listen(struct socket *sock, int backlog) inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + rc = kernel_listen(smc->clcsock, backlog); if (rc) { smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; -- Gitee From 56b1c897137a3fec43e8e5959f8a91633e910167 Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Thu, 10 Feb 2022 17:11:37 +0800 Subject: [PATCH 115/148] net/smc: Dynamic control handshake limitation by socket options This patch aims to add dynamic control for SMC handshake limitation for every smc sockets, in production environment, it is possible for the same applications to handle different service types, and may have different opinion on SMC handshake limitation. This patch try socket options to complete it, since we don't have socket option level for SMC yet, which requires us to implement it at the same time. This patch does the following: - add new socket option level: SOL_SMC. - add new SMC socket option: SMC_LIMIT_HS. - provide getter/setter for SMC socket options. Link: https://lore.kernel.org/all/20f504f961e1a803f85d64229ad84260434203bd.1644323503.git.alibuda@linux.alibaba.com/ Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- include/linux/socket.h | 1 + include/uapi/linux/smc.h | 4 +++ net/smc/af_smc.c | 69 +++++++++++++++++++++++++++++++++++++++- net/smc/smc.h | 1 + 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 9aa530d497da..4005895fe296 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -360,6 +360,7 @@ struct ucred { #define SOL_KCM 281 #define SOL_TLS 282 #define SOL_XDP 283 +#define SOL_SMC 286 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 20f33b27787f..631087d32669 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -282,4 +282,8 @@ enum { __SMC_NLA_SEID_TABLE_MAX, SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 }; + +/* SMC socket options */ +#define SMC_LIMIT_HS 1 /* constraint on smc handshake */ + #endif /* _UAPI_LINUX_SMC_H */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 805789a1589c..285451db2fd1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2330,7 +2330,8 @@ static int smc_listen(struct socket *sock, int backlog) 
inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; - tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + if (smc->limit_smc_hs) + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; rc = kernel_listen(smc->clcsock, backlog); if (rc) { @@ -2625,6 +2626,67 @@ static int smc_shutdown(struct socket *sock, int how) return rc ? rc : rc1; } +static int __smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + int val, len; + + smc = smc_sk(sock->sk); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case SMC_LIMIT_HS: + val = smc->limit_smc_hs; + break; + default: + return -EOPNOTSUPP; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int __smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int val, rc; + + smc = smc_sk(sk); + + lock_sock(sk); + switch (optname) { + case SMC_LIMIT_HS: + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + + smc->limit_smc_hs = !!val; + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + release_sock(sk); + + return rc; +} + static int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -2634,6 +2696,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (level == SOL_TCP && optname == TCP_ULP) return -EOPNOTSUPP; + else if (level == SOL_SMC) + return __smc_setsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sk); @@ -2716,6 +2780,9 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, struct smc_sock *smc; int rc; + if (level == SOL_SMC) + return __smc_getsockopt(sock, 
level, optname, optval, optlen); + smc = smc_sk(sock->sk); mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { diff --git a/net/smc/smc.h b/net/smc/smc.h index e91e40040d07..7e2693832a1b 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -249,6 +249,7 @@ struct smc_sock { /* smc sock container */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ + bool limit_smc_hs; /* put constraint on handshake */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ -- Gitee From c984f8fc9b7c72c16a8afa95bcfc29f5c8556873 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:38 +0800 Subject: [PATCH 116/148] net/smc: Add global configure for handshake limitation by netlink Although we can control SMC handshake limitation through socket options, which means that applications who need it must modify their code. It's quite troublesome for many existing applications. This patch modifies the global default value of SMC handshake limitation through netlink, providing a way to put constraint on handshake without modifies any code for applications. Suggested-by: Tony Lu Signed-off-by: D. Wythe Reviewed-by: Tony Lu Signed-off-by: David S. 
Miller --- include/net/netns/smc.h | 2 ++ include/uapi/linux/smc.h | 11 +++++++++++ net/smc/af_smc.c | 42 ++++++++++++++++++++++++++++++++++++++++ net/smc/smc.h | 6 ++++++ net/smc/smc_netlink.c | 15 ++++++++++++++ net/smc/smc_pnet.c | 3 +++ 6 files changed, 79 insertions(+) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index ea8a9cf2619b..47b166684fd8 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -12,5 +12,7 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; + + bool limit_smc_hs; /* constraint on handshake */ }; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 631087d32669..3c7278c6ef5d 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -59,6 +59,9 @@ enum { SMC_NETLINK_DUMP_SEID, SMC_NETLINK_ENABLE_SEID, SMC_NETLINK_DISABLE_SEID, + SMC_NETLINK_DUMP_HS_LIMITATION, + SMC_NETLINK_ENABLE_HS_LIMITATION, + SMC_NETLINK_DISABLE_HS_LIMITATION, }; /* SMC_GENL_FAMILY top level attributes */ @@ -283,6 +286,14 @@ enum { SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 }; +/* SMC_NETLINK_HS_LIMITATION attributes */ +enum { + SMC_NLA_HS_LIMITATION_UNSPEC, + SMC_NLA_HS_LIMITATION_ENABLED, /* u8 */ + __SMC_NLA_HS_LIMITATION_MAX, + SMC_NLA_HS_LIMITATION_MAX = __SMC_NLA_HS_LIMITATION_MAX - 1 +}; + /* SMC socket options */ #define SMC_LIMIT_HS 1 /* constraint on smc handshake */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 285451db2fd1..a00658af49d9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -66,6 +66,45 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + void *hdr; + + if (cb_ctx->pos[0]) + goto out; + + hdr = genlmsg_put(skb, 
NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_DUMP_HS_LIMITATION); + if (!hdr) + return -ENOMEM; + + if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, + sock_net(skb->sk)->smc.limit_smc_hs)) + goto err; + + genlmsg_end(skb, hdr); + cb_ctx->pos[0] = 1; +out: + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = true; + return 0; +} + +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = false; + return 0; +} + static void smc_set_keepalive(struct sock *sk, int val) { struct smc_sock *smc = smc_sk(sk); @@ -3011,6 +3050,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; + /* default behavior from limit_smc_hs in every net namespace */ + smc->limit_smc_hs = net->smc.limit_smc_hs; + rc = 0; if (!clcsock) { rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, diff --git a/net/smc/smc.h b/net/smc/smc.h index 7e2693832a1b..a096d8af21a0 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -14,6 +14,7 @@ #include #include #include /* __aligned */ +#include #include #include "smc_ib.h" @@ -336,4 +337,9 @@ void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid); +/* smc handshake limitation interface for netlink */ +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); + #endif /* __SMC_H */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index f13ab0661ed5..c5a62f6f52ba 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ 
-111,6 +111,21 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_seid, }, + { + .cmd = SMC_NETLINK_DUMP_HS_LIMITATION, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_hs_limitation, + }, + { + .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_enable_hs_limitation, + }, + { + .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_disable_hs_limitation, + }, }; static const struct nla_policy smc_gen_nl_policy[2] = { diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 13df00306182..4ded23241c20 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -867,6 +867,9 @@ int smc_pnet_net_init(struct net *net) smc_pnet_create_pnetids_list(net); + /* disable handshake limitation by default */ + net->smc.limit_smc_hs = 0; + return 0; } -- Gitee From 3b6d196fbefabc12bade4e60a691c7e580b531f5 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Fri, 11 Feb 2022 14:52:21 +0800 Subject: [PATCH 117/148] net/smc: Add comment for smc_tx_pending The previous patch introduces a lock-free version of smc_tx_work() to solve unnecessary lock contention, which is expected to be held lock. So this adds comment to remind people to keep an eye out for locks. Suggested-by: Stefan Raspl Signed-off-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_tx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 75f7d770dec8..deeef8308e1c 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -612,6 +612,10 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit. The caller + * must hold sock lock. 
+ */ void smc_tx_pending(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -627,7 +631,8 @@ void smc_tx_pending(struct smc_connection *conn) } /* Wakeup sndbuf consumers from process context - * since there is more data to transmit + * since there is more data to transmit in locked + * sock. */ void smc_tx_work(struct work_struct *work) { -- Gitee From c4921ca25fadba6f1177c94ad57984b38111d8cf Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 15 Feb 2022 16:24:50 +0800 Subject: [PATCH 118/148] net/smc: return ETIMEDOUT when smc_connect_clc() timeout When smc_connect_clc() times out, it will return -EAGAIN (tcp_recvmsg() returns -EAGAIN on timeout), and this value is then passed to the application, which is quite confusing for applications and inconsistent with TCP. According to the manual of connect, ETIMEDOUT is more suitable, so this patch converts EAGAIN to ETIMEDOUT in that case. Signed-off-by: D. Wythe Reviewed-by: Karsten Graul Link: https://lore.kernel.org/r/1644913490-21594-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a00658af49d9..cc717529e4bf 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1376,8 +1376,14 @@ static int __smc_connect(struct smc_sock *smc) /* perform CLC handshake */ rc = smc_connect_clc(smc, aclc2, ini); - if (rc) + if (rc) { + /* -EAGAIN on timeout, see tcp_recvmsg() */ + if (rc == -EAGAIN) { + rc = -ETIMEDOUT; + smc->sk.sk_err = ETIMEDOUT; + } goto vlan_cleanup; + } /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); -- Gitee From 8b848b3fb695ed500490575f015be3e11fc7a615 Mon Sep 17 00:00:00 2001 From: "D.
Wythe" Date: Thu, 24 Feb 2022 21:19:06 +0800 Subject: [PATCH 119/148] net/smc: fix connection leak There's a potential leak issue under the following execution sequence: smc_release smc_connect_work if (sk->sk_state == SMC_INIT) send_clc_confirim tcp_abort(); ... sk.sk_state = SMC_ACTIVE smc_close_active switch(sk->sk_state) { ... case SMC_ACTIVE: smc_close_final() // then wait peer closed Unfortunately, tcp_abort() may discard CLC CONFIRM messages that are still in the tcp send buffer, in which case our connection token cannot be delivered to the server side, which means that we cannot get a passive close message at all. Therefore, it is impossible for the connection to be disconnected at all. This patch tries a very simple way to avoid this issue: once the state has changed to SMC_ACTIVE after tcp_abort(), we can actively abort the smc connection. Considering that the state is SMC_INIT before tcp_abort(), abandoning the complete disconnection process should not cause too much of a problem. In fact, this problem may exist as long as the CLC CONFIRM message is not received by the server. Whether a timer should be added after smc_close_final() needs to be discussed in the future. But even so, this patch provides a faster release for the connection in the above case, so it should also be valuable. Fixes: 39f41f367b08 ("net/smc: common release code for non-accepted sockets") Signed-off-by: D. 
Wythe --- net/smc/af_smc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cc717529e4bf..ef0a875e689c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -268,7 +268,7 @@ static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = 0; + int old_state, rc = 0; if (!sk) goto out; @@ -276,8 +276,10 @@ static int smc_release(struct socket *sock) sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); + old_state = sk->sk_state; + /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); if (cancel_work_sync(&smc->connect_work)) @@ -291,6 +293,10 @@ static int smc_release(struct socket *sock) else lock_sock(sk); + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) + smc_close_active_abort(smc); + rc = __smc_release(smc); /* detach socket */ -- Gitee From 94c39a0879ad7f2aa0bce478a678c551892c7c9e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Feb 2022 18:32:59 +0300 Subject: [PATCH 120/148] net/smc: unlock on error paths in __smc_setsockopt() These two error paths need to release_sock(sk) before returning. Fixes: a6a6fe27bab4 ("net/smc: Dynamic control handshake limitation by socket options") Signed-off-by: Dan Carpenter Reviewed-by: D. Wythe Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ef0a875e689c..bd3937875ecf 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2721,10 +2721,14 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: - if (optlen < sizeof(int)) - return -EINVAL; - if (copy_from_sockptr(&val, optval, sizeof(int))) - return -EFAULT; + if (optlen < sizeof(int)) { + rc = -EINVAL; + break; + } + if (copy_from_sockptr(&val, optval, sizeof(int))) { + rc = -EFAULT; + break; + } smc->limit_smc_hs = !!val; rc = 0; -- Gitee From 2ddd042fd862c24c60bc750252d224b4e78bfcb2 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Fri, 25 Feb 2022 15:34:21 +0800 Subject: [PATCH 121/148] net/smc: Call trace_smc_tx_sendmsg when data corked This also calls trace_smc_tx_sendmsg() even if data is corked. For ease of understanding, if statements are not expanded here. Link: https://lore.kernel.org/all/f4166712-9a1e-51a0-409d-b7df25a66c52@linux.ibm.com/ Fixes: 139653bc6635 ("net/smc: Remove corked dealyed work") Suggested-by: Stefan Raspl Signed-off-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_tx.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index deeef8308e1c..fbb1c3eec97b 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -236,15 +236,14 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && - (atomic_read(&conn->sndbuf_space))) - /* for a corked socket defer the RDMA writes if - * sndbuf_space is still available. The applications - * should known how/when to uncork it. 
- */ - continue; - smc_tx_sndbuf_nonempty(conn); + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. + */ + if (!((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + atomic_read(&conn->sndbuf_space))) + smc_tx_sndbuf_nonempty(conn); trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From b2c9d463a36721d7a6485d622ef7d6272cd402cc Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:43:56 +0800 Subject: [PATCH 122/148] net/smc: add sysctl interface for SMC This patch add sysctl interface to support container environment for SMC as we talk in the mail list. Link: https://lore.kernel.org/netdev/20220224020253.GF5443@linux.alibaba.com Co-developed-by: Tony Lu Signed-off-by: Tony Lu Signed-off-by: Dust Li Signed-off-by: David S. Miller --- include/net/netns/smc.h | 3 ++ net/smc/Makefile | 2 +- net/smc/af_smc.c | 10 ++++++ net/smc/smc_sysctl.c | 70 +++++++++++++++++++++++++++++++++++++++++ net/smc/smc_sysctl.h | 32 +++++++++++++++++++ 5 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 net/smc/smc_sysctl.c create mode 100644 net/smc/smc_sysctl.h diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 47b166684fd8..1682eae50579 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -14,5 +14,8 @@ struct netns_smc { struct smc_stats_rsn *fback_rsn; bool limit_smc_hs; /* constraint on handshake */ +#ifdef CONFIG_SYSCTL + struct ctl_table_header *smc_hdr; +#endif }; #endif diff --git a/net/smc/Makefile b/net/smc/Makefile index 196fb6f01b14..640af9a39f9c 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += 
smc_tracepoint.o +smc-y += smc_tracepoint.o smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bd3937875ecf..9a4f22db2908 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -51,6 +51,7 @@ #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" +#include "smc_sysctl.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -3279,9 +3280,17 @@ static int __init smc_init(void) goto out_sock; } + rc = smc_sysctl_init(); + if (rc) { + pr_err("%s: sysctl_init fails with %d\n", __func__, rc); + goto out_ulp; + } + static_branch_enable(&tcp_have_smc); return 0; +out_ulp: + tcp_unregister_ulp(&smc_ulp_ops); out_sock: sock_unregister(PF_SMC); out_proto6: @@ -3309,6 +3318,7 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_sysctl_exit(); tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 000000000000..8a3a8e145976 --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu + * + */ + +#include +#include +#include + +#include "smc_sysctl.h" + +static struct ctl_table smc_table[] = { + { } +}; + +static __net_init int smc_sysctl_init_net(struct net *net) +{ + struct ctl_table *table; + + table = smc_table; + if (!net_eq(net, &init_net)) { + int i; + + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; + } + + net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc.smc_hdr) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static __net_exit void smc_sysctl_exit_net(struct net *net) +{ + unregister_net_sysctl_table(net->smc.smc_hdr); +} + +static struct pernet_operations smc_sysctl_ops __net_initdata = { + .init = smc_sysctl_init_net, + .exit = smc_sysctl_exit_net, +}; + +int __init smc_sysctl_init(void) +{ + return register_pernet_subsys(&smc_sysctl_ops); +} + +void smc_sysctl_exit(void) +{ + unregister_pernet_subsys(&smc_sysctl_ops); +} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 000000000000..49553ac236b6 --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL + +int smc_sysctl_init(void); +void smc_sysctl_exit(void); + +#else + +int smc_sysctl_init(void) +{ + return 0; +} + +void smc_sysctl_exit(void) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */ -- Gitee From fb644c1d1ffdc04a894ed8d21d3cfd1c42f64fa5 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:43:57 +0800 Subject: [PATCH 123/148] net/smc: add autocorking support This patch adds autocorking support for SMC which could improve throughput for small message by x3+. The main idea is borrowed from TCP autocorking with some RDMA specific modification: 1. The first message should never cork to make sure we won't bring extra latency 2. If we have posted any Tx WRs to the NIC that have not completed, cork the new messages until: a) Receive CQE for the last Tx WR b) We have corked enough message on the connection 3. Try to push the corked data out when we receive CQE of the last Tx WR to prevent the corked messages hang in the send queue. Both SMC autocorking and TCP autocorking check the TX completion to decide whether we should cork or not. The difference is when we got a SMC Tx WR completion, the data have been confirmed by the RNIC while TCP TX completion just tells us the data have been sent out by the local NIC. Add an atomic variable tx_pushing in smc_connection to make sure only one can send to let it cork more and save CDC slot. SMC autocorking should not bring extra latency since the first message will always been sent out immediately. The qperf tcp_bw test shows more than x4 increase under small message size with Mellanox connectX4-Lx, same result with other throughput benchmarks like sockperf/netperf. The qperf tcp_lat test shows SMC autocorking has not increase any ping-pong latency. 
Test command: client: smc_run taskset -c 1 qperf smc-server -oo msg_size:1:64K:*2 \ -t 30 -vu tcp_{bw|lat} server: smc_run taskset -c 1 qperf === Bandwidth ==== MsgSize(Bytes) SMC-NoCork TCP SMC-AutoCorking 1 0.578 MB/s 2.392 MB/s(313.57%) 2.647 MB/s(357.72%) 2 1.159 MB/s 4.780 MB/s(312.53%) 5.153 MB/s(344.71%) 4 2.283 MB/s 10.266 MB/s(349.77%) 10.363 MB/s(354.02%) 8 4.668 MB/s 19.040 MB/s(307.86%) 21.215 MB/s(354.45%) 16 9.147 MB/s 38.904 MB/s(325.31%) 41.740 MB/s(356.32%) 32 18.369 MB/s 79.587 MB/s(333.25%) 82.392 MB/s(348.52%) 64 36.562 MB/s 148.668 MB/s(306.61%) 161.564 MB/s(341.89%) 128 72.961 MB/s 274.913 MB/s(276.80%) 325.363 MB/s(345.94%) 256 144.705 MB/s 512.059 MB/s(253.86%) 633.743 MB/s(337.96%) 512 288.873 MB/s 884.977 MB/s(206.35%) 1250.681 MB/s(332.95%) 1024 574.180 MB/s 1337.736 MB/s(132.98%) 2246.121 MB/s(291.19%) 2048 1095.192 MB/s 1865.952 MB/s( 70.38%) 2057.767 MB/s( 87.89%) 4096 2066.157 MB/s 2380.337 MB/s( 15.21%) 2173.983 MB/s( 5.22%) 8192 3717.198 MB/s 2733.073 MB/s(-26.47%) 3491.223 MB/s( -6.08%) 16384 4742.221 MB/s 2958.693 MB/s(-37.61%) 4637.692 MB/s( -2.20%) 32768 5349.550 MB/s 3061.285 MB/s(-42.77%) 5385.796 MB/s( 0.68%) 65536 5162.919 MB/s 3731.408 MB/s(-27.73%) 5223.890 MB/s( 1.18%) ==== Latency ==== MsgSize(Bytes) SMC-NoCork TCP SMC-AutoCorking 1 10.540 us 11.938 us( 13.26%) 10.573 us( 0.31%) 2 10.996 us 11.992 us( 9.06%) 10.269 us( -6.61%) 4 10.229 us 11.687 us( 14.25%) 10.240 us( 0.11%) 8 10.203 us 11.653 us( 14.21%) 10.402 us( 1.95%) 16 10.530 us 11.313 us( 7.44%) 10.599 us( 0.66%) 32 10.241 us 11.586 us( 13.13%) 10.223 us( -0.18%) 64 10.693 us 11.652 us( 8.97%) 10.251 us( -4.13%) 128 10.597 us 11.579 us( 9.27%) 10.494 us( -0.97%) 256 10.409 us 11.957 us( 14.87%) 10.710 us( 2.89%) 512 11.088 us 12.505 us( 12.78%) 10.547 us( -4.88%) 1024 11.240 us 12.255 us( 9.03%) 10.787 us( -4.03%) 2048 11.485 us 16.970 us( 47.76%) 11.256 us( -1.99%) 4096 12.077 us 13.948 us( 15.49%) 12.230 us( 1.27%) 8192 13.683 us 16.693 us( 22.00%) 13.786 us( 
0.75%) 16384 16.470 us 23.615 us( 43.38%) 16.459 us( -0.07%) 32768 22.540 us 40.966 us( 81.75%) 23.284 us( 3.30%) 65536 34.192 us 73.003 us(113.51%) 34.233 us( 0.12%) With SMC autocorking support, we can archive better throughput than TCP in most message sizes without any latency trade-off. Signed-off-by: Dust Li Signed-off-by: David S. Miller --- net/smc/smc.h | 2 + net/smc/smc_cdc.c | 11 +++-- net/smc/smc_tx.c | 107 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 105 insertions(+), 15 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index a096d8af21a0..e266b04b7585 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -29,6 +29,7 @@ #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ +#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ extern struct proto smc_proto; extern struct proto smc_proto6; @@ -192,6 +193,7 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ + atomic_t tx_pushing; /* nr_threads trying tx push */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 9d5a97168969..2b37bec90824 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -48,9 +48,14 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && - unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) - wake_up(&conn->cdc_pend_tx_wq); + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { + /* If this is the last pending WR complete, we must push to + * prevent hang when autocork enabled. 
+ */ + smc_tx_sndbuf_nonempty(conn); + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); + } WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); smc_tx_sndbuf_nonfull(smc); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index fbb1c3eec97b..35cb0972dc12 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -131,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc) return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } +/* If we have pending CDC messages, do not send: + * Because CQE of this CDC message will happen shortly, it gives + * a chance to coalesce future sendmsg() payload in to one RDMA Write, + * without need for a timer, and with no latency trade off. + * Algorithm here: + * 1. First message should never cork + * 2. If we have pending Tx CDC messages, wait for the first CDC + * message's completion + * 3. Don't cork to much data in a single RDMA Write to prevent burst + * traffic, total corked message should not exceed sendbuf/2 + */ +static bool smc_should_autocork(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + int corking_size; + + corking_size = min(SMC_AUTOCORKING_DEFAULT_SIZE, + conn->sndbuf_desc->len >> 1); + + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || + smc_tx_prepared_sends(conn) > corking_size) + return false; + return true; +} + +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_should_autocork(smc)) + return true; + + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. + */ + if ((msg->msg_flags & MSG_MORE || + smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + atomic_read(&conn->sndbuf_space)) + return true; + + return false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. 
*/ @@ -236,13 +281,10 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - /* for a corked socket defer the RDMA writes if - * sndbuf_space is still available. The applications - * should known how/when to uncork it. + /* If we need to cork, do nothing and wait for the next + * sendmsg() call or push on tx completion */ - if (!((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && - atomic_read(&conn->sndbuf_space))) + if (!smc_tx_should_cork(smc, msg)) smc_tx_sndbuf_nonempty(conn); trace_smc_tx_sendmsg(smc, copylen); @@ -590,13 +632,26 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc; + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc = 0; + + /* No data in the send queue */ + if (unlikely(smc_tx_prepared_sends(conn) <= 0)) + goto out; + + /* Peer don't have RMBE space */ + if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); + goto out; + } if (conn->killed || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) - return -EPIPE; /* connection being aborted */ + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -EPIPE; /* connection being aborted */ + goto out; + } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -604,10 +659,38 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) if (!rc) { /* trigger socket release if connection is closing */ - struct smc_sock *smc = container_of(conn, struct smc_sock, - conn); smc_close_wake_tx_prepared(smc); } + +out: + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + /* This make sure only one can send simultaneously to prevent wasting + * of CPU and CDC slot. 
+ * Record whether someone has tried to push while we are pushing. + */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + smp_wmb(); /* Make sure tx_pushing is 1 before real send */ + rc = __smc_tx_sndbuf_nonempty(conn); + + /* We need to check whether someone else have added some data into + * the send queue and tried to push but failed after the atomic_set() + * when we are pushing. + * If so, we need to push again to prevent those data hang in the send + * queue. + */ + if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) + goto again; + return rc; } -- Gitee From 7b0001f52796294fea138672ef87b392281ea995 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:43:58 +0800 Subject: [PATCH 124/148] net/smc: add sysctl for autocorking This add a new sysctl: net.smc.autocorking_size We can dynamically change the behaviour of autocorking by change the value of autocorking_size. Setting to 0 disables autocorking in SMC Signed-off-by: Dust Li Signed-off-by: David S. Miller --- Documentation/networking/smc-sysctl.rst | 23 +++++++++++++++++++++++ include/net/netns/smc.h | 1 + net/smc/smc_sysctl.c | 10 ++++++++++ net/smc/smc_tx.c | 2 +- 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 Documentation/networking/smc-sysctl.rst diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst new file mode 100644 index 000000000000..c53f8c61c9e4 --- /dev/null +++ b/Documentation/networking/smc-sysctl.rst @@ -0,0 +1,23 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +SMC Sysctl +========= + +/proc/sys/net/smc/* Variables +============================== + +autocorking_size - INTEGER + Setting SMC auto corking size: + SMC auto corking is like TCP auto corking from the application's + perspective of view. 
When applications do consecutive small + write()/sendmsg() system calls, we try to coalesce these small writes + as much as possible, to lower total amount of CDC and RDMA Write been + sent. + autocorking_size limits the maximum corked bytes that can be sent to + the under device in 1 single sending. If set to 0, the SMC auto corking + is disabled. + Applications can still use TCP_CORK for optimal behavior when they + know how/when to uncork their sockets. + + Default: 64K diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 1682eae50579..e5389eeaf8bd 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -17,5 +17,6 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif + unsigned int sysctl_autocorking_size; }; #endif diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 8a3a8e145976..3b59876aaac9 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -14,9 +14,17 @@ #include #include +#include "smc.h" #include "smc_sysctl.h" static struct ctl_table smc_table[] = { + { + .procname = "autocorking_size", + .data = &init_net.smc.sysctl_autocorking_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { } }; @@ -40,6 +48,8 @@ static __net_init int smc_sysctl_init_net(struct net *net) if (!net->smc.smc_hdr) goto err_reg; + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + return 0; err_reg: diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 35cb0972dc12..fd01a2ea9846 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -147,7 +147,7 @@ static bool smc_should_autocork(struct smc_sock *smc) struct smc_connection *conn = &smc->conn; int corking_size; - corking_size = min(SMC_AUTOCORKING_DEFAULT_SIZE, + corking_size = min(sock_net(&smc->sk)->smc.sysctl_autocorking_size, conn->sndbuf_desc->len >> 1); if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || -- Gitee From a0cc59e6d10a93caf61e48cd877608a8d84cd2fa Mon Sep 17 00:00:00 2001 From: 
Dust Li Date: Tue, 1 Mar 2022 17:43:59 +0800 Subject: [PATCH 125/148] net/smc: send directly on setting TCP_NODELAY In commit ea785a1a573b("net/smc: Send directly when TCP_CORK is cleared"), we don't use delayed work to implement cork. This patch use the same algorithm, removes the delayed work when setting TCP_NODELAY and send directly in setsockopt(). This also makes the TCP_NODELAY the same as TCP. Cc: Tony Lu Signed-off-by: Dust Li Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9a4f22db2908..21d23feaed3c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2802,8 +2802,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_CLOSED) { if (val) { SMC_STAT_INC(smc, ndly_cnt); - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); } } break; -- Gitee From e11a0d809b4927bb0838fa1b3b992a68787afd37 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:44:00 +0800 Subject: [PATCH 126/148] net/smc: correct settings of RMB window update limit rmbe_update_limit is used to limit announcing receive window updating too frequently. RFC7609 request a minimal increase in the window size of 10% of the receive buffer space. But current implementation used: min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2) and SOCK_MIN_SNDBUF / 2 == 2304 Bytes, which is almost always less then 10% of the receive buffer space. This causes the receiver always sending CDC message to update its consumer cursor when it consumes more then 2K of data. And as a result, we may encounter something like "TCP silly window syndrome" when sending 2.5~8K message. This patch fixes this using max(rmbe_size / 10, SOCK_MIN_SNDBUF / 2). With this patch and SMC autocorking enabled, qperf 2K/4K/8K tcp_bw test shows 45%/75%/40% increase in throughput respectively. 
Signed-off-by: Dust Li Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index aa4f16718fa1..9b162a33fea9 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2004,7 +2004,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { - return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); + return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } /* map an rmb buf to a link */ -- Gitee From 5e3ee9818d2506455fb5d5e48e7ce76f5c240342 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:44:02 +0800 Subject: [PATCH 127/148] net/smc: don't send in the BH context if sock_owned_by_user Send data all the way down to the RDMA device is a time consuming operation(get a new slot, maybe do RDMA Write and send a CDC, etc). Moving those operations from BH to user context is good for performance. If the sock_lock is hold by user, we don't try to send data out in the BH context, but just mark we should send. Since the user will release the sock_lock soon, we can do the sending there. Add smc_release_cb() which will be called in release_sock() and try send in the callback if needed. This patch moves the sending part out from BH if sock lock is hold by user. In my testing environment, this saves about 20% softirq in the qperf 4K tcp_bw test in the sender side with no noticeable throughput drop. Signed-off-by: Dust Li Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 16 ++++++++++++++++ net/smc/smc.h | 4 ++++ net/smc/smc_cdc.c | 19 ++++++++++++++----- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 21d23feaed3c..fdaf2b33d097 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -193,12 +193,27 @@ void smc_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(smc_unhash_sk); +/* This will be called before user really release sock_lock. So do the + * work which we didn't do because of user hold the sock_lock in the + * BH context + */ +static void smc_release_cb(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + if (smc->conn.tx_in_release_sock) { + smc_tx_pending(&smc->conn); + smc->conn.tx_in_release_sock = false; + } +} + struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -211,6 +226,7 @@ struct proto smc_proto6 = { .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, diff --git a/net/smc/smc.h b/net/smc/smc.h index e266b04b7585..ea0620529ebe 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -213,6 +213,10 @@ struct smc_connection { * data still pending */ char urg_rx_byte; /* urgent byte */ + bool tx_in_release_sock; + /* flush pending tx data in + * sock release_cb() + */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 2b37bec90824..5c731f27996e 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -49,10 +49,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, } if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { - /* If this is the last pending WR 
complete, we must push to - * prevent hang when autocork enabled. + /* If user owns the sock_lock, mark the connection need sending. + * User context will later try to send when it release sock_lock + * in smc_release_cb() */ - smc_tx_sndbuf_nonempty(conn); + if (sock_owned_by_user(&smc->sk)) + conn->tx_in_release_sock = true; + else + smc_tx_pending(conn); + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) wake_up(&conn->cdc_pend_tx_wq); } @@ -355,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((diff_cons && smc_tx_prepared_sends(conn)) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || - conn->local_rx_ctrl.prod_flags.urg_data_pending) - smc_tx_sndbuf_nonempty(conn); + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (!sock_owned_by_user(&smc->sk)) + smc_tx_pending(conn); + else + conn->tx_in_release_sock = true; + } if (diff_cons && conn->urg_tx_pend && atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) { -- Gitee From 5a5cd75730b5242e9cdb3a52b3801eccc9ee96b7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 1 Mar 2022 14:24:46 -0800 Subject: [PATCH 128/148] net: smc: fix different types in min() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build: include/linux/minmax.h:45:25: note: in expansion of macro ‘__careful_cmp’ 45 | #define min(x, y) __careful_cmp(x, y, <) | ^~~~~~~~~~~~~ net/smc/smc_tx.c:150:24: note: in expansion of macro ‘min’ 150 | corking_size = min(sock_net(&smc->sk)->smc.sysctl_autocorking_size, | ^~~ Fixes: 12bbb0d163a9 ("net/smc: add sysctl for autocorking") Link: https://lore.kernel.org/r/20220301222446.1271127-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/smc/smc_tx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index fd01a2ea9846..ea0ca6bcb37b 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ 
-147,8 +147,8 @@ static bool smc_should_autocork(struct smc_sock *smc) struct smc_connection *conn = &smc->conn; int corking_size; - corking_size = min(sock_net(&smc->sk)->smc.sysctl_autocorking_size, - conn->sndbuf_desc->len >> 1); + corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1, + sock_net(&smc->sk)->smc.sysctl_autocorking_size); if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || smc_tx_prepared_sends(conn) > corking_size) -- Gitee From 905de9c59f850644e789489fe27bcc731f1dbba5 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 2 Mar 2022 21:25:11 +0800 Subject: [PATCH 129/148] net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error generated by client The main reason for this unexpected SMC_CLC_DECL_ERR_REGRMB in client dues to following execution sequence: Server Conn A: Server Conn B: Client Conn B: smc_lgr_unregister_conn smc_lgr_register_conn smc_clc_send_accept -> smc_rtoken_add smcr_buf_unuse -> Client Conn A: smc_rtoken_delete smc_lgr_unregister_conn() makes current link available to assigned to new incoming connection, while smcr_buf_unuse() has not executed yet, which means that smc_rtoken_add may fail because of insufficient rtoken_entry, reversing their execution order will avoid this problem. Fixes: 3e034725c0d8 ("net/smc: common functions for RMBs and send buffers") Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9b162a33fea9..b0ddf2969e8d 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1177,8 +1177,8 @@ void smc_conn_free(struct smc_connection *conn) cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { - smc_lgr_unregister_conn(conn); smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) -- Gitee From d67e376f934eae912af1b11580841129c0ac006f Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Wed, 2 Mar 2022 21:25:12 +0800 Subject: [PATCH 130/148] net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error cause by server The problem of SMC_CLC_DECL_ERR_REGRMB on the server is very clear. It is based on the fact that whether a new SMC connection can be accepted depends not only on the limit of conn nums, but also on the available entries of rtoken. Since the rtoken release is triggered by the peer, while the conn nums is decreased locally, tons of things can happen in this time difference. The only thing that needs to be mentioned is that now all connection creations are completely protected by the smc_server_lgr_pending lock, so it's enough to check only the available entries in rtokens_used_mask. Fixes: cd6851f30386 ("smc: remote memory buffers (RMBs)") Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index b0ddf2969e8d..45234b3877ef 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1880,7 +1880,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; -- Gitee From bda9a3111ef568db04b10bd54d75814f49e677c8 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 8 Dec 2021 21:31:33 +0800 Subject: [PATCH 131/148] anolis: net/smc: Introduce iWARP device support ANBZ: #51 This patch includes RDMA_NODE_RNIC as SMC-R's ib-device node_type and expands determine-gid logic for iWARP device which doesn't support VLAN.
Signed-off-by: Wen Gu Reviewed-by: Tony Lu Acked-by: Dust Li --- net/smc/smc_ib.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 519884ecebab..634db2ea0050 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -223,7 +223,7 @@ static int smc_ib_determine_gid_rcu(const struct net_device *ndev, u8 gid[], u8 *sgid_index, struct smc_init_info_smcrv2 *smcrv2) { - if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) { + if (!smcrv2) { if (gid) memcpy(gid, &attr->gid, SMC_GID_SIZE); if (sgid_index) @@ -278,10 +278,11 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); - if (!IS_ERR(ndev) && + if ((smcibdev->ibdev->port_data[ibport].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_IWARP) || (!IS_ERR(ndev) && ((!vlan_id && !is_vlan_dev(ndev)) || (vlan_id && is_vlan_dev(ndev) && - vlan_dev_vlan_id(ndev) == vlan_id))) { + vlan_dev_vlan_id(ndev) == vlan_id)))) { if (!smc_ib_determine_gid_rcu(ndev, attr, gid, sgid_index, smcrv2)) { rcu_read_unlock(); @@ -905,7 +906,8 @@ static int smc_ib_add_dev(struct ib_device *ibdev) u8 port_cnt; int i; - if (ibdev->node_type != RDMA_NODE_IB_CA) + if (ibdev->node_type != RDMA_NODE_IB_CA && + ibdev->node_type != RDMA_NODE_RNIC) return -EOPNOTSUPP; smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); -- Gitee From 774c517ab23d7f3c369977f98c9557ede21787fd Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 8 Dec 2021 21:45:03 +0800 Subject: [PATCH 132/148] anolis: net/smc: Supplement for SMC-R iWARP support ANBZ: #38 SMC-R currently uses qp_context to carry iWARP extended information when ib_create_qp() is invoked. However, eRDMA driver has no way to distinguish whether qp_context should be resolved as iWARP extended information. So this patch introduces a new create_flags bit to indicate it, which will be set by SMC-R when creating an iWARP qp. 
Signed-off-by: Wen Gu Acked-by: Dust Li --- include/rdma/ib_verbs.h | 2 ++ net/smc/smc_ib.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c8cbf7e39a78..a155f6d28ce2 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1133,6 +1133,8 @@ enum ib_qp_create_flags { IB_QP_CREATE_SOURCE_QPN = 1 << 10, IB_QP_CREATE_PCI_WRITE_END_PADDING = IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, + + IB_QP_CREATE_IWARP_WITHOUT_CM = 1 << 25, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 634db2ea0050..65bf38cac7fd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -674,8 +674,14 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, }; + struct ib_device *ib_dev = lnk->smcibdev->ibdev; + struct ib_port_immutable immutable; int rc; + ib_dev->ops.get_port_immutable(ib_dev, lnk->ibport, &immutable); + if (immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP) + qp_attr.create_flags |= IB_QP_CREATE_IWARP_WITHOUT_CM; + lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); if (IS_ERR(lnk->roce_qp)) -- Gitee From d6a870fe958d272428fee9888a11e1568d428233 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 13:01:48 +0800 Subject: [PATCH 133/148] anolis: net/smc: Introduce tunable sysctls for sndbuf and RMB size ANBZ: #51 This patch introduces sysctls for SMC, and separates {w|r}mem_default knobs from net.core and net.ipv4 to SMC. SMC connections' sndbuf and RMB are tunable with sysctl net.smc.{w|r}mem_default.
Signed-off-by: Tony Lu Reviewed-by: Xuan Zhuo Acked-by: Dust Li --- include/net/netns/smc.h | 2 ++ net/smc/af_smc.c | 22 +++++++++++++++++++--- net/smc/smc_sysctl.c | 20 ++++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index e5389eeaf8bd..3ffaddd1ff12 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -18,5 +18,7 @@ struct netns_smc { struct ctl_table_header *smc_hdr; #endif unsigned int sysctl_autocorking_size; + int sysctl_wmem_default; + int sysctl_rmem_default; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fdaf2b33d097..652cf8c060e0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -353,6 +354,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + sk->sk_sndbuf = net->smc.sysctl_wmem_default; + sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -3098,9 +3101,6 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); - out: return rc; } @@ -3179,6 +3179,13 @@ unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { + if (net != &init_net) { + net->smc.sysctl_wmem_default = + init_net.smc.sysctl_wmem_default; + net->smc.sysctl_rmem_default = + init_net.smc.sysctl_rmem_default; + } + return smc_pnet_net_init(net); } @@ -3212,6 +3219,8 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { int rc; + int max_rshare, max_wshare; + unsigned long limit; rc = register_pernet_subsys(&smc_net_ops);
if (rc) @@ -3296,6 +3305,13 @@ static int __init smc_init(void) goto out_sock; } + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); + max_wshare = min(4UL * 1024 * 1024, limit); + max_rshare = min(6UL * 1024 * 1024, limit); + + init_net.smc.sysctl_wmem_default = 256 * 1024; + init_net.smc.sysctl_rmem_default = 384 * 1024; + rc = smc_sysctl_init(); if (rc) { pr_err("%s: sysctl_init fails with %d\n", __func__, rc); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 3b59876aaac9..0b0fd044c274 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -16,6 +16,10 @@ #include "smc.h" #include "smc_sysctl.h" +#include "smc_core.h" + +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; static struct ctl_table smc_table[] = { { @@ -25,6 +29,22 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, + { + .procname = "wmem_default", + .data = &init_net.smc.sysctl_wmem_default, + .maxlen = sizeof(init_net.smc.sysctl_wmem_default), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &init_net.smc.sysctl_rmem_default, + .maxlen = sizeof(init_net.smc.sysctl_rmem_default), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, { } }; -- Gitee From 02ab3764975926c0ef04189d6adf66e8a40b905f Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:15:06 +0800 Subject: [PATCH 134/148] anolis: net/smc: Expose SMCPROTO_SMC and SMCPROTO_SMC6 to userspace ANBZ: #51 This patch exposes SMCPROTO_SMC and SMCPROTO_SMC6 to userspace by moving them to in.h and in6.h.
Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li --- include/uapi/linux/in.h | 3 +++ include/uapi/linux/in6.h | 2 ++ net/smc/smc.h | 4 ---- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index d1b327036ae4..40b1e51b18c9 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,6 +84,9 @@ enum { }; #endif +/* SMC protocol, IPv4 */ +#define SMCPROTO_SMC 0 + #if __UAPI_DEF_IN_ADDR /* Internet address. */ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 5ad396a57eb3..6c21c85be0e3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,6 +95,8 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 +/* SMC protocol, IPv6 */ +#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/net/smc/smc.h b/net/smc/smc.h index ea0620529ebe..0b7f39df2449 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -22,10 +22,6 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 - -#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ -#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ - #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ -- Gitee From ca55e1e4fb7e397bfd6bd04ce7c5bb6ea156e313 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:16:24 +0800 Subject: [PATCH 135/148] anolis: net/smc: Introduce sysctl tcp2smc ANBZ: #51 This patch adds sysctl 'tcp2smc' to provide a switch for replacing TCP to SMC-R when new sockets are created in a specific net namespace.
Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 3 +++ net/smc/smc_sysctl.c | 7 +++++++ net/socket.c | 8 ++++++++ 4 files changed, 19 insertions(+) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 3ffaddd1ff12..364d0e250734 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -20,5 +20,6 @@ struct netns_smc { unsigned int sysctl_autocorking_size; int sysctl_wmem_default; int sysctl_rmem_default; + int sysctl_tcp2smc; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 652cf8c060e0..70f6104c5e81 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3184,6 +3184,7 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_wmem_default; net->smc.sysctl_rmem_default = init_net.smc.sysctl_rmem_default; + net->smc.sysctl_tcp2smc = 0; } return smc_pnet_net_init(net); @@ -3191,6 +3192,7 @@ static __net_init int smc_net_init(struct net *net) static void __net_exit smc_net_exit(struct net *net) { + net->smc.sysctl_tcp2smc = 0; smc_pnet_net_exit(net); } @@ -3311,6 +3313,7 @@ static int __init smc_init(void) init_net.smc.sysctl_wmem_default = 256 * 1024; init_net.smc.sysctl_rmem_default = 384 * 1024; + init_net.smc.sysctl_tcp2smc = 0; rc = smc_sysctl_init(); if (rc) { diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 0b0fd044c274..a064f76f5878 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -45,6 +45,13 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, + { + .procname = "tcp2smc", + .data = &init_net.smc.sysctl_tcp2smc, + .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/socket.c b/net/socket.c index d52c265ad449..96860a0f9330 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1367,6 +1367,14 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family
= PF_PACKET; } +#if IS_ENABLED(CONFIG_SMC) + if (!kern && (family == AF_INET || family == AF_INET6) && + type == SOCK_STREAM && (protocol == IPPROTO_IP || + protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { + protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; + family = AF_SMC; + } +#endif err = security_socket_create(family, type, protocol, kern); if (err) -- Gitee From 9c9a94132a062e3f968b8cc211c06705383cccfc Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:33:42 +0800 Subject: [PATCH 136/148] anolis: net/smc: Introduce SMC-R-related proc files ANBZ: #51 This patch introduces SMC-R proc files to report statistics information of SMC-R connections. Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li --- include/net/net_namespace.h | 1 + include/net/smc.h | 5 +- net/smc/Makefile | 2 +- net/smc/af_smc.c | 29 +++- net/smc/smc_diag.c | 29 ++-- net/smc/smc_proc.c | 287 ++++++++++++++++++++++++++++++++++++ net/smc/smc_proc.h | 34 +++++ 7 files changed, 364 insertions(+), 23 deletions(-) create mode 100644 net/smc/smc_proc.c create mode 100644 net/smc/smc_proc.h diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 76e9cce289a4..220878bfe86b 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,6 +95,7 @@ struct net { struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; + struct proc_dir_entry *proc_net_smc; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; diff --git a/include/net/smc.h b/include/net/smc.h index e441aa97ad61..743b4fe74346 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -12,10 +12,13 @@ #define _SMC_H #define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ +#define SMC_HTABLE_SHIFT 9 +#define SMC_HTABLE_SIZE (1 << SMC_HTABLE_SHIFT) /* Size of SMC hashtable buckets */ struct smc_hashinfo { + unsigned int bkt_idx; rwlock_t lock; - struct hlist_head ht; + struct hlist_head ht[SMC_HTABLE_SIZE]; }; int smc_hash_sk(struct sock *sk); diff --git a/net/smc/Makefile b/net/smc/Makefile index 640af9a39f9c..19076ff20d58 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o +smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 70f6104c5e81..41fe4f341128 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,6 +53,7 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" +#include "smc_proc.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -172,11 +173,13 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - head = &h->ht; - write_lock_bh(&h->lock); + + head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); return 0; @@ -3220,7 +3223,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc; + int rc, i; int max_rshare, max_wshare; unsigned long limit; @@ -3292,19 +3295,28 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); + + for (i = 0; i < SMC_HTABLE_SIZE; i++) { + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); + } + + rc = 
smc_proc_init(); + if (rc) { + pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); + goto out_sock; + } rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_sock; + goto out_proc; } rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_sock; + goto out_proc; } limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); @@ -3326,6 +3338,8 @@ static int __init smc_init(void) out_ulp: tcp_unregister_ulp(&smc_ulp_ops); +out_proc: + smc_proc_exit(); out_sock: sock_unregister(PF_SMC); out_proto6: @@ -3355,6 +3369,7 @@ static void __exit smc_exit(void) static_branch_disable(&tcp_have_smc); smc_sysctl_exit(); tcp_unregister_ulp(&smc_ulp_ops); + smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 25ef26b621a2..8d436e42a85b 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -196,24 +196,25 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; - int rc = 0, num = 0; + int rc = 0, num = 0, slot; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - head = &prot->h.smc_hash->ht; - if (hlist_empty(head)) - goto out; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + + for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { + head = &prot->h.smc_hash->ht[slot]; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; + num++; + } } out: diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c new file mode 100644 index 000000000000..19d8cc82a7ac --- /dev/null +++ 
b/net/smc/smc_proc.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include "smc.h" +#include "smc_proc.h" +#include "smc_core.h" + +static void *smc_get_next(struct seq_file *seq, void *cur) +{ + struct smc_proc_private *sp = seq->private; + struct smc_hashinfo *smc_hash = + sp->protocol == SMCPROTO_SMC ? + smc_proto.h.smc_hash : smc_proto6.h.smc_hash; + struct net *net = seq_file_net(seq); + struct hlist_head *head; + struct sock *sk = cur; + + if (!sk) { + read_lock(&smc_hash->lock); +get_head: + head = &smc_hash->ht[sp->bucket]; + sk = sk_head(head); + sp->offset = 0; + goto get_sk; + } + ++sp->num; + ++sp->offset; + + sk = sk_next(sk); +get_sk: + sk_for_each_from(sk) { + if (!net_eq(sock_net(sk), net)) + continue; + return sk; + } + sp->offset = 0; + if (++sp->bucket < SMC_HTABLE_SIZE) + goto get_head; + + read_unlock(&smc_hash->lock); + return NULL; +} + +static void *smc_seek_last_pos(struct seq_file *seq) +{ + struct smc_proc_private *sp = seq->private; + int offset = sp->offset; + int orig_num = sp->num; + void *rc = NULL; + + if (sp->bucket >= SMC_HTABLE_SIZE) + goto out; + + rc = smc_get_next(seq, NULL); + while (offset-- && rc) + rc = smc_get_next(seq, rc); + + if (rc) + goto out; + + sp->bucket = 0; +out: + sp->num = orig_num; + return rc; +} + +static void *smc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct smc_proc_private *sp = seq->private; + void *rc; + + sp->bucket = 0; + rc = smc_get_next(seq, NULL); + + while (rc && pos) { + rc = smc_get_next(seq, rc); + --pos; + } + return rc; +} + +static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) +{ + struct smc_proc_private *sp = seq->private; + void *rc; + + if (*pos && *pos == sp->last_pos) { + rc = smc_seek_last_pos(seq); + if (rc) + goto out; + } + + sp->num = 0; + sp->bucket = 0; + sp->offset = 0; + sp->protocol = protocol; + rc = *pos ? 
smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + +out: + sp->last_pos = *pos; + return rc; +} + +static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) +{ + return _smc_conn_start(seq, pos, SMCPROTO_SMC); +} + +static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) +{ + return _smc_conn_start(seq, pos, SMCPROTO_SMC6); +} + +static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) +{ + struct smc_proc_private *sp = seq->private; + const struct in6_addr *dest, *src; + struct smc_link_group *lgr; + struct socket *clcsock; + struct smc_link *lnk; + struct sock *sk; + bool fb = false; + int i; + + fb = smc->use_fallback; + clcsock = smc->clcsock; + sk = &smc->sk; + + if (protocol == SMCPROTO_SMC) + seq_printf(seq, CONN4_ADDR_FM, sp->num, + clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, + clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); + else if (protocol == SMCPROTO_SMC6) { + dest = &clcsock->sk->sk_v6_daddr; + src = &clcsock->sk->sk_v6_rcv_saddr; + seq_printf(seq, CONN6_ADDR_FM, sp->num, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); + } + + seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, + sk, clcsock->sk, fb ? clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); + + lgr = smc->conn.lgr; + lnk = smc->conn.lnk; + + if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { + for (i = 0; i < SMC_LGR_ID_SIZE; i++) + seq_printf(seq, "%02X", lgr->id[i]); + + seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', + lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, + lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); + } else { + seq_puts(seq, "- - - - - - - -\n"); + } +} + +static int smc_conn_show(struct seq_file *seq, void *v) +{ + struct smc_proc_private *sp = seq->private; + struct socket *clcsock; + struct smc_sock *smc; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, + "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", + "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", + "l_qp", "r_qp", "tx_cnt", "rx_cnt"); + goto out; + } + + smc = smc_sk(v); + clcsock = smc->clcsock; + if (!clcsock) + goto out; + + _conn_show(seq, smc, sp->protocol); +out: + return 0; +} + +static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct smc_proc_private *sp = seq->private; + void *rc = NULL; + + if (v == SEQ_START_TOKEN) { + rc = smc_get_idx(seq, 0); + goto out; + } + rc = smc_get_next(seq, v); +out: + ++*pos; + sp->last_pos = *pos; + return rc; +} + +static void smc_conn_stop(struct seq_file *seq, void *v) +{ + struct smc_proc_private *sp = seq->private; + struct smc_hashinfo *smc_hash = + sp->protocol == SMCPROTO_SMC ? 
+ smc_proto.h.smc_hash : smc_proto6.h.smc_hash; + + if (v && v != SEQ_START_TOKEN) + read_unlock(&smc_hash->lock); +} + +static struct smc_proc_entry smc_proc[] = { + { + .name = "smc4", + .ops = { + .show = smc_conn_show, + .start = smc_conn4_start, + .next = smc_conn_next, + .stop = smc_conn_stop, + }, + }, +#if IS_ENABLED(CONFIG_IPV6) + { + .name = "smc6", + .ops = { + .show = smc_conn_show, + .start = smc_conn6_start, + .next = smc_conn_next, + .stop = smc_conn_stop, + }, + }, +#endif +}; + +static int __net_init smc_proc_dir_init(struct net *net) +{ + int i, rc = -ENOMEM; + + net->proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); + if (!net->proc_net_smc) + goto err; + + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { + if (!proc_create_net_data(smc_proc[i].name, 0444, + net->proc_net_smc, &smc_proc[i].ops, + sizeof(struct smc_proc_private), + NULL)) + goto err_entry; + } + + return 0; + +err_entry: + for (i -= 1; i >= 0; i--) + remove_proc_entry(smc_proc[i].name, net->proc_net_smc); + + remove_proc_entry("smc", net->proc_net); +err: + return rc; +} + +static void __net_exit smc_proc_dir_exit(struct net *net) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) + remove_proc_entry(smc_proc[i].name, net->proc_net_smc); + + remove_proc_entry("smc", net->proc_net); +} + +static struct pernet_operations smc_proc_ops = { + .init = smc_proc_dir_init, + .exit = smc_proc_dir_exit, +}; + +int __init smc_proc_init(void) +{ + return register_pernet_subsys(&smc_proc_ops); +} + +void smc_proc_exit(void) +{ + unregister_pernet_subsys(&smc_proc_ops); +} diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h new file mode 100644 index 000000000000..ec59ca03e163 --- /dev/null +++ b/net/smc/smc_proc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _SMC_PROC_H_ +#define _SMC_PROC_H_ + +#include +#include +#include +#include +#include +#include "smc.h" + +#define CONN4_HDR 
("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") +#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") +#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") + +struct smc_proc_private { + struct seq_net_private p; + int num, bucket, offset; + int protocol; + loff_t last_pos; +}; + +struct smc_proc_entry { + const char *name; + const struct seq_operations ops; +}; + +int __init smc_proc_init(void); +void smc_proc_exit(void); + +#endif -- Gitee From d0b7379876f65cc8245c9113a8aa4c0b6d2cd7d5 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:36:07 +0800 Subject: [PATCH 137/148] anolis: net/smc: Introduce TCP to SMC replacement netlink commands ANBZ: #51 This patch introduces new SMC-R generic netlink commands SMC_NETLINK_{ ADD | DEL | GET }_TCP2SMC_WLIST to add | delete | get application-oriented TCP-to-SMC replacement white list. 
Comparison between Average time cost of creating or destroying 2000 TCP connections in different situations: 1) Without this patch and remove the patch which introduces TCP2SMC sysctl: Average creation time cost: 1106 us; Average destruction time cost: 6 us; 2) With this patch but not load SMC module: Average creation time cost: 1161 us; Average destruction time cost: 6 us; 3) With this patch and load SMC module: Average creation time cost: 1157 us; Average destruction time cost: 6 us; 4) With this patch, load SMC module and add 2 elements in TCP2SMC conversion white list: Average creation time cost: 1177 us; Average destruction time cost: 6 us; Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li --- include/net/netns/smc.h | 9 +- include/uapi/linux/smc.h | 3 + net/smc/Makefile | 2 +- net/smc/af_smc.c | 14 ++- net/smc/smc_conv.c | 186 +++++++++++++++++++++++++++++++++++++++ net/smc/smc_conv.h | 22 +++++ net/smc/smc_netlink.c | 19 +++- net/smc/smc_netlink.h | 5 ++ net/socket.c | 39 ++++++-- 9 files changed, 288 insertions(+), 11 deletions(-) create mode 100644 net/smc/smc_conv.c create mode 100644 net/smc/smc_conv.h diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 364d0e250734..60facba8cf22 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -6,14 +6,21 @@ struct smc_stats_rsn; struct smc_stats; +struct smc_convert { + int wlist_len; + struct mutex wlist_lock; + struct list_head wlist; + int (*smc_conv_match_rcu)(struct net *net, char *comm); +}; + struct netns_smc { /* per cpu counters for SMC */ struct smc_stats __percpu *smc_stats; /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - bool limit_smc_hs; /* constraint on handshake */ + struct smc_convert smc_conv; #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 3c7278c6ef5d..759bcb2ff03e 100644 --- a/include/uapi/linux/smc.h +++ 
b/include/uapi/linux/smc.h @@ -62,6 +62,9 @@ enum { SMC_NETLINK_DUMP_HS_LIMITATION, SMC_NETLINK_ENABLE_HS_LIMITATION, SMC_NETLINK_DISABLE_HS_LIMITATION, + SMC_NETLINK_ADD_TCP2SMC_WLIST, + SMC_NETLINK_DEL_TCP2SMC_WLIST, + SMC_NETLINK_GET_TCP2SMC_WLIST, }; /* SMC_GENL_FAMILY top level attributes */ diff --git a/net/smc/Makefile b/net/smc/Makefile index 19076ff20d58..72b3c934e473 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o +smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o smc_conv.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 41fe4f341128..2b4010c186f8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -54,6 +54,7 @@ #include "smc_tracepoint.h" #include "smc_sysctl.h" #include "smc_proc.h" +#include "smc_conv.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -3307,16 +3308,22 @@ static int __init smc_init(void) goto out_sock; } + rc = smc_conv_init(); + if (rc) { + pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); + goto out_proc; + } + rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_proc; + goto out_conv; } rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_proc; + goto out_conv; } limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); @@ -3338,6 +3345,8 @@ static int __init smc_init(void) out_ulp: tcp_unregister_ulp(&smc_ulp_ops); +out_conv: + smc_conv_exit(); out_proc: smc_proc_exit(); out_sock: @@ -3369,6 +3378,7 @@ static void __exit smc_exit(void) static_branch_disable(&tcp_have_smc); smc_sysctl_exit(); tcp_unregister_ulp(&smc_ulp_ops); + smc_conv_exit(); 
smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_conv.c b/net/smc/smc_conv.c new file mode 100644 index 000000000000..e1f87d1de8a5 --- /dev/null +++ b/net/smc/smc_conv.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include "smc_netlink.h" +#include "smc_conv.h" + +int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *wlist_elem, *tmp; + char msg[TASK_COMM_LEN]; + struct nlattr *na; + + na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; + if (!na) + return -EINVAL; + + nla_strlcpy(msg, na, TASK_COMM_LEN); + + mutex_lock(wlist_lock); + if (*wlist_len >= SMC_MAX_WLIST_LEN) { + mutex_unlock(wlist_lock); + return -EINVAL; + } + + list_for_each_entry(tmp, wlist, list) { + if (!strcmp(tmp->task_comm, msg)) + goto out; + } + + wlist_elem = kmalloc(sizeof(*wlist_elem), GFP_KERNEL); + if (!wlist_elem) { + mutex_unlock(wlist_lock); + return -ENOMEM; + } + + strcpy(wlist_elem->task_comm, msg); + list_add_tail_rcu(&wlist_elem->list, wlist); + ++*wlist_len; +out: + mutex_unlock(wlist_lock); + return 0; +} + +int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *tmp, *nxt; + char msg[TASK_COMM_LEN]; + struct nlattr *na; + + na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; + if (!na) + return -EINVAL; + + nla_strlcpy(msg, na, TASK_COMM_LEN); + + mutex_lock(wlist_lock); + list_for_each_entry_safe(tmp, nxt, wlist, list) { + if (!strcmp(tmp->task_comm, msg)) { + 
list_del_rcu(&tmp->list); + synchronize_rcu(); + kfree(tmp); + --*wlist_len; + break; + } + } + mutex_unlock(wlist_lock); + return 0; +} + +int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct list_head *wlist = &net->smc.smc_conv.wlist; + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_conv_wlist_elem *tmp; + void *nlh; + + if (cb_ctx->pos[0]) + goto errmsg; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_TCP2SMC_WLIST); + if (!nlh) + goto errmsg; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, wlist, list) { + if (nla_put(skb, SMC_CMD_ATTR_TCP2SMC, + nla_total_size(strlen(tmp->task_comm) + 1), + tmp->task_comm)) { + rcu_read_unlock(); + goto errattr; + } + } + rcu_read_unlock(); + + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + return skb->len; + +errattr: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_match_tcp2smc_wlist(struct net *net, char *comm) +{ + struct list_head *wlist = &net->smc.smc_conv.wlist; + struct smc_conv_wlist_elem *tmp; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, wlist, list) { + if (!strcmp(tmp->task_comm, comm)) { + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + return -1; +} + +static int __net_init smc_net_conv_init(struct net *net) +{ + INIT_LIST_HEAD_RCU(&net->smc.smc_conv.wlist); + net->smc.smc_conv.wlist_len = 0; + + mutex_init(&net->smc.smc_conv.wlist_lock); + + rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, + smc_match_tcp2smc_wlist); + return 0; +} + +static void __net_exit smc_net_conv_exit(struct net *net) +{ + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *cur, *nxt; + struct list_head tmp_list; + + 
rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, NULL); + synchronize_rcu(); + + INIT_LIST_HEAD(&tmp_list); + + mutex_lock(wlist_lock); + list_splice_init_rcu(wlist, &tmp_list, synchronize_rcu); + *wlist_len = 0; + mutex_unlock(wlist_lock); + + list_for_each_entry_safe(cur, nxt, &tmp_list, list) { + list_del(&cur->list); + kfree(cur); + } +} + +static struct pernet_operations smc_conv_ops = { + .init = smc_net_conv_init, + .exit = smc_net_conv_exit, +}; + +int __init smc_conv_init(void) +{ + return register_pernet_subsys(&smc_conv_ops); +} + +void smc_conv_exit(void) +{ + unregister_pernet_subsys(&smc_conv_ops); +} diff --git a/net/smc/smc_conv.h b/net/smc/smc_conv.h new file mode 100644 index 000000000000..1615b27feede --- /dev/null +++ b/net/smc/smc_conv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef NET_SMC_SMC_CONV_H_ +#define NET_SMC_SMC_CONV_H_ +#include +#include +#include + +#define SMC_MAX_WLIST_LEN 32 + +struct smc_conv_wlist_elem { + char task_comm[TASK_COMM_LEN]; + struct list_head list; +}; + +int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); +int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); +int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb); +int __init smc_conv_init(void); +void smc_conv_exit(void); + +#endif /* NET_SMC_SMC_CONV_H_ */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index c5a62f6f52ba..52dba083b70e 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -22,6 +22,7 @@ #include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" +#include "smc_conv.h" const struct nla_policy smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { @@ -126,9 +127,25 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_hs_limitation, }, + { + .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .doit = smc_nl_add_tcp2smc_wlist, + }, + { + 
.cmd = SMC_NETLINK_DEL_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .doit = smc_nl_del_tcp2smc_wlist, + }, + { + .cmd = SMC_NETLINK_GET_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_tcp2smc_wlist, + }, }; -static const struct nla_policy smc_gen_nl_policy[2] = { +static const struct nla_policy smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { + [SMC_CMD_ATTR_TCP2SMC] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN - 1 }, [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, }; diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index e8c6c3f0e98c..aae13737095e 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -15,6 +15,11 @@ #include #include +enum { + SMC_CMD_ATTR_TCP2SMC = 1, + SMC_CMD_MAX_ATTR, +}; + extern struct genl_family smc_gen_nl_family; extern const struct nla_policy smc_gen_ueid_policy[]; diff --git a/net/socket.c b/net/socket.c index 96860a0f9330..3917e02b2b2f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -141,6 +141,38 @@ static void sock_show_fdinfo(struct seq_file *m, struct file *f) #define sock_show_fdinfo NULL #endif +#if IS_ENABLED(CONFIG_SMC) +static bool try_tcp2smc_convert(struct net *net, int *family, int type, + int *protocol, int kern) +{ + int (*f)(struct net *n, char *c) = NULL; + + /* Only convert userspace socket */ + if (kern) + return false; + + if ((*family == AF_INET || *family == AF_INET6) && + type == SOCK_STREAM && + (*protocol == IPPROTO_IP || *protocol == IPPROTO_TCP)) { + if (net->smc.sysctl_tcp2smc) + goto convert; + + rcu_read_lock(); + f = rcu_dereference(net->smc.smc_conv.smc_conv_match_rcu); + if (f && !f(net, current->comm)) { + rcu_read_unlock(); + goto convert; + } + rcu_read_unlock(); + } + return false; +convert: + *protocol = (*family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; + *family = AF_SMC; + return true; +} +#endif + /* * Socket files have a set of 'special' operations as well as the generic file ones. 
These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. @@ -1368,12 +1400,7 @@ int __sock_create(struct net *net, int family, int type, int protocol, family = PF_PACKET; } #if IS_ENABLED(CONFIG_SMC) - if (!kern && (family == AF_INET || family == AF_INET6) && - type == SOCK_STREAM && (protocol == IPPROTO_IP || - protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { - protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; - family = AF_SMC; - } + try_tcp2smc_convert(net, &family, type, &protocol, kern); #endif err = security_socket_create(family, type, protocol, kern); -- Gitee From 080d2821496f51a2e42781398d668f3e181cbaf8 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:41:05 +0800 Subject: [PATCH 138/148] anolis: net/smc: Add TX and RX diagnosis information ANBZ: #51 This patch adds RX / TX execution and data size counters for each SMC connection which will be reported in diagnosis information. Signed-off-by: Wen Gu Reviewed-by: Tony Lu Acked-by: Dust Li --- include/uapi/linux/smc_diag.h | 6 ++++++ net/smc/smc.h | 6 ++++++ net/smc/smc_core.c | 15 +++++++++++++++ net/smc/smc_diag.c | 6 ++++++ net/smc/smc_rx.c | 2 ++ net/smc/smc_tx.c | 8 +++++++- 6 files changed, 42 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 8cb3a6fef553..182efdd3ec91 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -79,6 +79,12 @@ struct smc_diag_conninfo { struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ struct smc_diag_cursor tx_sent; /* sent cursor */ struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ + __u64 rx_cnt; /* rx counter */ + __u64 tx_cnt; /* tx counter */ + __u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ + __u64 rx_bytes; /* rx size */ + __u64 tx_bytes; /* tx size */ + __u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ }; /* SMC_DIAG_LINKINFO */ diff --git 
a/net/smc/smc.h b/net/smc/smc.h index 0b7f39df2449..9ee5eeb600e4 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -228,6 +228,12 @@ struct smc_connection { u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ + u64 rx_cnt; /* rx counter */ + u64 tx_cnt; /* tx counter */ + u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ + u64 rx_bytes; /* rx size */ + u64 tx_bytes; /* tx size */ + u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 freed : 1; /* normal termiation */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 45234b3877ef..3cd3604a1739 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1844,6 +1844,20 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } +static void smc_rx_tx_counter_init(struct smc_connection *conn) +{ + /* Initialize RX & TX diagnostic inform for each + * connection. 
These counters mean what smc wants + * net devices "TODO" insead of what has been "DONE" + */ + conn->rx_cnt = 0; + conn->tx_cnt = 0; + conn->tx_corked_cnt = 0; + conn->rx_bytes = 0; + conn->tx_bytes = 0; + conn->tx_corked_bytes = 0; +} + /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -1928,6 +1942,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); + smc_rx_tx_counter_init(conn); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8d436e42a85b..bbe00b50b666 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,6 +136,12 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, + .rx_cnt = conn->rx_cnt, + .tx_cnt = conn->tx_cnt, + .tx_corked_cnt = conn->tx_corked_cnt, + .rx_bytes = conn->rx_bytes, + .tx_bytes = conn->tx_bytes, + .tx_corked_bytes = conn->tx_corked_bytes, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 51e8eb2933ff..bf353c68323d 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -392,6 +392,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable--; /* always stop at urgent Byte */ /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); + conn->rx_bytes += copylen; /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - @@ -441,6 +442,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } 
trace_smc_rx_recvmsg(smc, copylen); + ++conn->rx_cnt; } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index ea0ca6bcb37b..a12dde653e27 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -284,8 +284,14 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) /* If we need to cork, do nothing and wait for the next * sendmsg() call or push on tx completion */ - if (!smc_tx_should_cork(smc, msg)) + if (!smc_tx_should_cork(smc, msg)) { + conn->tx_bytes += copylen; + ++conn->tx_cnt; smc_tx_sndbuf_nonempty(conn); + } else { + conn->tx_corked_bytes += copylen; + ++conn->tx_corked_cnt; + } trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From 4d022120d6e98e1ffd3f7c4d70e9908202e11ec4 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Thu, 2 Sep 2021 13:19:26 +0800 Subject: [PATCH 139/148] anolis: net/smc: don't call ib_req_notify_cq in the send routine ANBZ: #51 We can just call ib_req_notify_cq() when the link got ready, and rearm it after poll_cq(). Which is enough to make sure we won't miss any events. Simple sockperf test show about 20% gain in throughput test with small messages. 
Test command: client: smc_run sockperf tp -i $SERVER -m 14 -t 30 --tcp server: smc_run sockperf sr --tcp Without this: Summary: BandWidth is 6.504 MBps (52.034 Mbps) With this: Summary: BandWidth is 7.846 MBps (62.771 Mbps) Signed-off-by: Dust Li Acked-by: Dust Li --- net/smc/smc_ib.c | 6 ++++++ net/smc/smc_wr.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 65bf38cac7fd..8e2b1af1d291 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -135,6 +135,12 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; + + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 24be1d03fef9..ca179e2c86b7 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -306,8 +306,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From 833fd015a280c81664e3bb2ae1d6ff119241bb95 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 22 Sep 2021 11:17:18 +0800 Subject: [PATCH 140/148] anolis: net/smc: allow different subnet communication ANBZ: #51 SMC checks prefix to ensure that peers are in the same subnet. But there is no need to check this for iWARP over ERDMA, because ERDMA peers can communicate with each other beyond the subnet. So we provide a sysctl knob allow_different_subnet to support it.
Signed-off-by: Tony Lu Acked-by: Dust Li --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 14 ++++++++++---- net/smc/smc_sysctl.c | 9 +++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 60facba8cf22..f961104d9f90 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -28,5 +28,6 @@ struct netns_smc { int sysctl_wmem_default; int sysctl_rmem_default; int sysctl_tcp2smc; + int sysctl_allow_different_subnet; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2b4010c186f8..2f6d9d1f6776 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2121,6 +2121,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { + struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -2128,10 +2129,12 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); + if (!net->smc.sysctl_allow_different_subnet) { + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); + } /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -3189,6 +3192,7 @@ static __net_init int smc_net_init(struct net *net) net->smc.sysctl_rmem_default = init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; + net->smc.sysctl_allow_different_subnet = 0; } return smc_pnet_net_init(net); @@ -3197,6 +3201,7 @@ static __net_init int smc_net_init(struct net *net) static void __net_exit smc_net_exit(struct net *net) { net->smc.sysctl_tcp2smc = 0; + net->smc.sysctl_allow_different_subnet = 0; smc_pnet_net_exit(net); } @@ -3333,6 +3338,7 @@ static int __init 
smc_init(void) init_net.smc.sysctl_wmem_default = 256 * 1024; init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; + init_net.smc.sysctl_allow_different_subnet = 0; rc = smc_sysctl_init(); if (rc) { diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a064f76f5878..4e95de438888 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -52,6 +52,15 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "allow_different_subnet", + .data = &init_net.smc.sysctl_allow_different_subnet, + .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; -- Gitee From b0d63989af6a6c837db2e02634ed3bfe38d1ce43 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 16 Dec 2021 17:38:05 +0800 Subject: [PATCH 141/148] anolis: net/smc: Avoid unmapping bufs from unused links ANBZ: #264 smcr_buf_free() intends to unmap each link of link group from a specific buf_desc according to lnk->link_idx. However, if the link has already been cleared before, its lnk->link_idx is 0 and smcr_buf_unmap_link() will repeatedly try to unmap lnk[0] from a buf_desc. The wrong lnk->link_idx won't cause any problems currently because unused links have already unmapped their bufs in smcr_link_clear(). But the wrong lnk->link_idx doesn't match the semantics, so it is better to avoid unmapping an unused link.
Signed-off-by: Wen Gu Acked-by: Tony Lu --- net/smc/smc_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3cd3604a1739..9bf65589ded7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1292,8 +1292,11 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, { int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + continue; smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); + } if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); -- Gitee From f2197307f88670b0bfed825e69e0e1bd48f17d4a Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Fri, 11 Feb 2022 18:12:25 +0800 Subject: [PATCH 142/148] net/smc: Add sysctl control for handshake limitation see commit: net/smc: Add global configure for handshake limitation by netlink This patch just adds a sysctl controller for anolis. Signed-off-by: D. Wythe --- net/smc/smc_sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 4e95de438888..6a2eea8fc4a4 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -61,6 +61,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "limit_handshake", + .data = &init_net.smc.limit_smc_hs, + .maxlen = sizeof(bool), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; -- Gitee From 422fa66668c7736ac0f745c9172ba526d84da1a1 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 12 Jan 2022 00:15:51 +0800 Subject: [PATCH 143/148] anolis: net/smc: Support rq flow control in smc-r link layer ANBZ: #254 This patch supports rq flow control in smc-r link layer.
QPs communicating without rq flow control, in the previous version, may result in an RNR (receiver not ready) error, which means that the sq sends a message to the remote qp, but the remote qp's rq has no rq entities to receive the message. In an RNR situation, the rdma transport layer may retransmit the messages again and again until the rq has any entities, which may lower the performance, especially in heavy traffic. Using credits to do rq flow control can avoid the occurrence of RNR. The test of redis-benchmark shows more than 3X rps improvement in SET and more than 7X rps improvement in GET. Test command: redis-server --save "" --appendonly no --protected-mode no --io-threads 7 --io-threads-do-reads yes redis-benchmark -h 192.168.26.36 -q -t set,get -P 1 --threads 7 -n 2000000 -c 500 -d 10 Before: SET: 173325.25 requests per second, p50=2.703 msec GET: 81383.52 requests per second, p50=5.575 msec After: SET: 554323.69 requests per second, p50=0.959 msec GET: 604741.19 requests per second, p50=0.855 msec Signed-off-by: Guangguan Wang --- net/smc/af_smc.c | 12 ++++++ net/smc/smc_cdc.c | 12 +++++- net/smc/smc_cdc.h | 3 +- net/smc/smc_clc.c | 3 ++ net/smc/smc_clc.h | 3 +- net/smc/smc_core.h | 17 ++++++++- net/smc/smc_ib.c | 6 ++- net/smc/smc_llc.c | 92 +++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 5 +++ net/smc/smc_wr.c | 31 +++++++++++++--- net/smc/smc_wr.h | 54 ++++++++++++++++++++++++++- 11 files changed, 223 insertions(+), 15 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2f6d9d1f6776..af51e519d1b2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -646,6 +646,13 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; + link->credits_enable = clc->r0.init_credits ?
1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, clc->r0.init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. + link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); + } } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, @@ -1189,6 +1196,11 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { + reason_code = SMC_CLC_DECL_CREDITSERR; + goto connect_abort; + } + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGRMB; goto connect_abort; diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 5c731f27996e..84eed367699e 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -111,25 +111,30 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; + struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; + u8 saved_credits = 0; int rc; smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + smc_host_msg_to_cdc(cdc_msg, conn, &cfed); + saved_credits = (u8)smc_wr_rx_get_credits(link); + cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (!rc) { + if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_wr_rx_put_credits(link, saved_credits); atomic_dec(&conn->cdc_pend_tx_wr); } @@ -445,6 +450,9 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message 
*/ + if (cdc->credits) + smc_wr_tx_put_credits(link, cdc->credits, true); + /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 696cc11f2303..145ce7997e64 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,7 +47,8 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 reserved[18]; + u8 credits; /* credits synced by every cdc msg */ + u8 reserved[17]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ce27399b38b1..3180c8500c5f 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1038,9 +1038,12 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; + clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + clc->r0.init_credits = + link->credits_enable ? 
(u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_short; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 83f02f131fc0..eb4bba54d6df 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -63,6 +63,7 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ +#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -190,7 +191,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 reserved; + u8 init_credits; /* QP rq init credits for rq flowctrl */ __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 35a85ec08919..5849a98c7f6e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -21,7 +21,12 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_RMBS_PER_LGR_MAX 32 /* max. # of RMBs per link group. Correspondingly, + * SMC_WR_BUF_CNT should not be less than 2 * + * SMC_RMBS_PER_LGR_MAX, since every connection at + * least has two rq/sq credits in average, otherwise + * may result in waiting for credits in sending process. 
+ */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -80,6 +85,8 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 +#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 + struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -124,6 +131,14 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ + atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ + atomic_t local_rq_credits; /* credits for local rq flowctrl */ + u8 credits_enable; /* credits enable flag, set when negotiation */ + u8 local_cr_watermark_high; /* local rq credits watermark */ + u8 peer_cr_watermark_low; /* peer rq credits watermark */ + struct work_struct credits_announce_work; /* work for credits announcement */ + unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 8e2b1af1d291..9d55173d474f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -670,10 +670,12 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND + * there are max. 2 RDMA_WRITE per 1 WR_SEND. + * RDMA_WRITE consumes send queue entities, + * without recv queue entities. 
*/ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_recv_wr = SMC_WR_BUF_CNT, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, }, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 1d8dafa1a35e..67b8b1595770 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -75,7 +75,8 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 reserved[8]; + u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved[7]; }; struct smc_llc_msg_add_link_cont_rt { @@ -170,6 +171,12 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct smc_llc_msg_announce_credits { /* type 0x0A */ + struct smc_llc_hdr hd; + u8 credits; + u8 reserved[39]; +}; + struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ struct smc_llc_hdr hd; u8 num_rkeys; @@ -189,6 +196,7 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; + struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -748,6 +756,46 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } +/* send credits announce request or response */ +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force) +{ + struct smc_llc_msg_announce_credits *announce_credits; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + u8 saved_credits = 0; + + if (!link->credits_enable || + (!force && !smc_wr_rx_credits_need_announce(link))) + return 0; + + saved_credits = (u8)smc_wr_rx_get_credits(link); + if (!saved_credits) + /* maybe synced by cdc msg */ + return 0; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) { + smc_wr_rx_put_credits(link, saved_credits); + return rc; + } + + announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; + memset(announce_credits, 0, sizeof(*announce_credits)); + 
announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; + announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); + if (reqresp == SMC_LLC_RESP) + announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; + announce_credits->credits = saved_credits; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + if (rc) + smc_wr_rx_put_credits(link, saved_credits); + + return rc; +} + /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -1010,6 +1058,13 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; + link->credits_enable = add_llc->init_credits ? 1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, add_llc->init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. 
+ link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); + } } /* as an SMC client, process an add link request */ @@ -1930,6 +1985,10 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); + break; case SMC_LLC_REQ_ADD_LINK: /* handle response here, smc_llc_flow_stop() cannot be called * in tasklet context @@ -2015,6 +2074,10 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); + break; default: smc_llc_protocol_violation(link->lgr, qentry->msg.raw.hdr.common.type); @@ -2108,6 +2171,27 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +static void smc_llc_announce_credits_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, + struct smc_link, credits_announce_work); + int rc, retry = 0, agains = 0; + +again: + do { + rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); + } while ((rc == -EBUSY) && smc_link_sendable(link) && + (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); + + if (smc_wr_rx_credits_need_announce(link) && + smc_link_sendable(link) && agains <= 5 && !rc) { + agains++; + goto again; + } + + clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); +} + void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -2143,6 +2227,7 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; 
} @@ -2174,6 +2259,7 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); + cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -2288,6 +2374,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ANNOUNCE_CREDITS + }, /* V2 types */ { .handler = smc_llc_rx_handler, diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 4404e52b3346..f8a14643faf4 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -20,6 +20,8 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY (1) + enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -35,6 +37,7 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + SMC_LLC_ANNOUNCE_CREDITS = 0X0A, /* V2 types */ SMC_LLC_CONFIRM_LINK_V2 = 0x21, SMC_LLC_ADD_LINK_V2 = 0x22, @@ -86,6 +89,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ca179e2c86b7..8384c4306c7d 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -130,7 +130,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - wake_up(&link->wr_tx_wait); + if 
(wq_has_sleeper(&link->wr_tx_wait)) + wake_up(&link->wr_tx_wait); } static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) @@ -173,11 +174,16 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; + + if (!smc_wr_tx_get_credit(link)) + return -EBUSY; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; + smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -283,7 +289,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - wake_up(&link->wr_tx_wait); + smc_wr_tx_put_credits(link, 1, true); return 1; } else if (link->lgr->smc_version == SMC_V2 && pend->idx == link->wr_tx_cnt) { @@ -469,6 +475,12 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) break; } } + + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); + } } } @@ -511,6 +523,8 @@ int smc_wr_rx_post_init(struct smc_link *link) for (i = 0; i < link->wr_rx_cnt; i++) rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); return rc; } @@ -545,7 +559,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_recv_wr); } @@ -734,7 +748,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs 
= kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -742,7 +756,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -761,7 +775,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) @@ -884,6 +898,11 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + atomic_set(&lnk->peer_rq_credits, 0); + atomic_set(&lnk->local_rq_credits, 0); + lnk->flags = 0; + lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); + lnk->peer_cr_watermark_low = 0; return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index a54e90a1110f..8cf276215c91 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,7 +19,12 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ +#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT + * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, + * since every connection at least has two rq/sq + * credits in average, otherwise may result in + * waiting for credits in sending process. 
+ */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -83,6 +88,51 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } +// get one tx credit, and peer rq credits dec +static inline int smc_wr_tx_get_credit(struct smc_link *link) +{ + return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; +} + +// put tx credits, when some failures occurred after tx credits got +// or receive announce credits msgs +static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) +{ + if (link->credits_enable && credits) { + atomic_add(credits, &link->peer_rq_credits); + if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) + wake_up_nr(&link->wr_tx_wait, credits); + } +} + +// to check whether peer rq credits is lower than watermark. +static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; +} + +// get local rq credits and set credits to zero. +// may called when announcing credits +static inline int smc_wr_rx_get_credits(struct smc_link *link) +{ + return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; +} + +// called when post_recv a rqe +static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) +{ + if (link->credits_enable && credits) + atomic_add(credits, &link->local_rq_credits); +} + +// to check whether local rq credits is higher than watermark. 
+static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -95,6 +145,8 @@ static inline int smc_wr_rx_post(struct smc_link *link) index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); + if (!rc) + smc_wr_rx_put_credits(link, 1); return rc; } -- Gitee From 06153438a8390b4b9a5becee2ac2590ef9691185 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 12 Jan 2022 01:04:22 +0800 Subject: [PATCH 144/148] anolis: net/smc: Introduce link-related proc file ANBZ: #346 This patch introduces link-related proc files to report statistics information of SMC-R links. Signed-off-by: Guangguan Wang --- net/smc/smc_proc.c | 58 +++++++++++++++++++++++++++++++++++++++++++--- net/smc/smc_proc.h | 10 ++++---- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c index 19d8cc82a7ac..106887b7b9e1 100644 --- a/net/smc/smc_proc.c +++ b/net/smc/smc_proc.c @@ -154,9 +154,11 @@ static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 'C' : 'S', lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); + lnk->peer_qpn, smc->conn.tx_cnt, smc->conn.tx_bytes, + smc->conn.tx_corked_cnt, smc->conn.tx_corked_bytes); } else { - seq_puts(seq, "- - - - - - - -\n"); + seq_puts(seq, "- - - - - - -" + " - - -\n"); } } @@ -170,7 +172,7 @@ static int smc_conn_show(struct seq_file *seq, void *v) seq_printf(seq, sp->protocol == SMCPROTO_SMC ? 
CONN4_HDR : CONN6_HDR, "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_cnt", "rx_cnt"); + "l_qp", "r_qp", "tx_P", "tx_B", "cork_P", "cork_B"); goto out; } @@ -234,6 +236,51 @@ static struct smc_proc_entry smc_proc[] = { #endif }; +extern struct smc_lgr_list smc_lgr_list; +static int proc_show_links(struct seq_file *seq, void *v) +{ + struct smc_link_group *lgr, *lg; + struct smc_link *lnk; + int i = 0, j = 0; + + seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s\n", + "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", + "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags"); + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + if (!smc_link_usable(lnk)) + continue; + for (j = 0; j < SMC_LGR_ID_SIZE; j++) + seq_printf(seq, "%02X", lgr->id[j]); + seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu\n", + lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, + lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, + lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, + lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, + atomic_read(&lnk->local_rq_credits), + atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, + lnk->peer_cr_watermark_low, lnk->flags); + } + } + spin_unlock_bh(&smc_lgr_list.lock); + return 0; +} + +static int proc_open_links(struct inode *inode, struct file *file) +{ + single_open(file, proc_show_links, NULL); + return 0; +} + +static struct proc_ops link_file_ops = { +.proc_open = proc_open_links, +.proc_read = seq_read, +.proc_release = single_release, +}; + static int __net_init smc_proc_dir_init(struct net *net) { int i, rc = -ENOMEM; @@ -250,6 +297,9 @@ static int __net_init smc_proc_dir_init(struct net *net) goto err_entry; } + if (!proc_create("links", 0444, net->proc_net_smc, &link_file_ops)) + goto err_entry; + return 0; err_entry: @@ -265,6 +315,8 @@ static void __net_exit smc_proc_dir_exit(struct net *net) { int i; + remove_proc_entry("links", net->proc_net_smc); + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) remove_proc_entry(smc_proc[i].name, net->proc_net_smc); diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h index ec59ca03e163..faa5eaaee511 100644 --- a/net/smc/smc_proc.h +++ b/net/smc/smc_proc.h @@ -9,12 +9,14 @@ #include #include "smc.h" -#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ + "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ + "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") #define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") #define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") +#define CONN_SK_FM (" %c %-8X %pK 
%pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8llu %-8llu %-8llu %-8llu\n") struct smc_proc_private { struct seq_net_private p; -- Gitee From 7ff2d2bf2b3a318792023ea4fb5c52e3b7cfd9a5 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 13 Jan 2022 17:06:19 +0800 Subject: [PATCH 145/148] net/smc: Introduce smc_ib_cq to bind link and cq This patch introduces struct smc_ib_cq as a medium between smc_link and ib_cq. Every smc_link can access ib_cq from their own, and unbinds smc_link from smc_ib_device. This allows flexible mapping, prepares for multiple CQs support. Signed-off-by: Tony Lu --- net/smc/smc_core.h | 2 ++ net/smc/smc_ib.c | 86 ++++++++++++++++++++++++++++++++-------------- net/smc/smc_ib.h | 13 ++++--- net/smc/smc_wr.c | 32 ++++++++--------- 4 files changed, 88 insertions(+), 45 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5849a98c7f6e..35951baf55f9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -94,6 +94,8 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ + struct smc_ib_cq *smcibcq_recv; /* cq for recv */ + struct smc_ib_cq *smcibcq_send; /* cq for send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9d55173d474f..cc16377fafa7 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,12 +131,12 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, IB_CQ_SOLICITED_MASK); if (rc) goto out; - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) goto out; @@ -656,6 +656,8 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) if (lnk->roce_qp) ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; + 
lnk->smcibcq_send = NULL; + lnk->smcibcq_recv = NULL; } /* create a queue pair within the protection domain for a link */ @@ -665,8 +667,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->roce_cq_send, - .recv_cq = lnk->smcibdev->roce_cq_recv, + .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, + .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -692,10 +694,13 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); - if (IS_ERR(lnk->roce_qp)) + if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; - else + } else { + lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; + lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; smc_wr_remember_qp_attr(lnk); + } return rc; } @@ -812,10 +817,21 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } +static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) +{ + ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); + kfree(smcibdev->ib_cq_send); + smcibdev->ib_cq_send = NULL; + + ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); + kfree(smcibdev->ib_cq_recv); + smcibdev->ib_cq_recv = NULL; +} + long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { - struct ib_cq_init_attr cqattr = { - .cqe = SMC_MAX_CQE, .comp_vector = 0 }; + struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; + struct smc_ib_cq *smcibcq_send, *smcibcq_recv; int cqe_size_order, smc_order; long rc; @@ -828,28 +844,49 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibdev, &cqattr); - rc = 
PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); - if (IS_ERR(smcibdev->roce_cq_send)) { - smcibdev->roce_cq_send = NULL; + smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); + if (!smcibcq_send) { + rc = -ENOMEM; + goto out; + } + smcibcq_send->smcibdev = smcibdev; + smcibcq_send->is_send = 1; + cqattr.comp_vector = 0; + smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibcq_send, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); + if (IS_ERR(smcibdev->ib_cq_send)) { + smcibdev->ib_cq_send = NULL; goto out; } - smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibdev, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); - if (IS_ERR(smcibdev->roce_cq_recv)) { - smcibdev->roce_cq_recv = NULL; - goto err; + smcibdev->ib_cq_send = smcibcq_send; + + smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); + if (!smcibcq_recv) { + rc = -ENOMEM; + goto err_send; + } + smcibcq_recv->smcibdev = smcibdev; + cqattr.comp_vector = 1; + smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibcq_recv, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); + if (IS_ERR(smcibdev->ib_cq_recv)) { + smcibdev->ib_cq_recv = NULL; + goto err_recv; } + smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; goto out; -err: - ib_destroy_cq(smcibdev->roce_cq_send); +err_recv: + kfree(smcibcq_recv); + ib_destroy_cq(smcibcq_send->ib_cq); +err_send: + kfree(smcibcq_send); out: mutex_unlock(&smcibdev->mutex); return rc; @@ -861,8 +898,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) goto out; smcibdev->initialized = 0; - ib_destroy_cq(smcibdev->roce_cq_recv); - ib_destroy_cq(smcibdev->roce_cq_send); + smc_ib_cleanup_cq(smcibdev); smc_wr_remove_dev(smcibdev); out: mutex_unlock(&smcibdev->mutex); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5d8b49c57f50..9b24033e20e4 100644 --- 
/* Per-CQ state: wraps one ib_cq together with the tasklet that polls it.
 * The wrapper (not the device) is passed as cq_context to the completion
 * handlers, which schedule @tasklet.
 */
struct smc_ib_cq {		/* ib_cq wrapper for smc */
	struct smc_ib_device	*smcibdev;	/* parent ib device */
	struct ib_cq		*ib_cq;		/* real ib_cq for link */
	struct tasklet_struct	tasklet;	/* tasklet for wr */
	bool			is_send;	/* true: send cq, false: recv cq */
};
ib_req_notify_cq(dev->roce_cq_send, + ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); } @@ -162,9 +162,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - tasklet_schedule(&dev->send_tasklet); + tasklet_schedule(&smcibcq->tasklet); } /*---------------------------- request submission ---------------------------*/ @@ -327,7 +327,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibdev->roce_cq_send, + ib_req_notify_cq(link->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { @@ -371,7 +371,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, + ib_req_notify_cq(link->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; @@ -486,7 +486,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); + struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int polled = 0; int rc; @@ -495,9 +495,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(dev->roce_cq_recv, + ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); } @@ -511,9 +511,9 @@ static void 
smc_wr_rx_tasklet_fn(struct tasklet_struct *t) void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - tasklet_schedule(&dev->recv_tasklet); + tasklet_schedule(&smcibcq->tasklet); } int smc_wr_rx_post_init(struct smc_link *link) @@ -842,14 +842,14 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->recv_tasklet); - tasklet_kill(&smcibdev->send_tasklet); + tasklet_kill(&smcibdev->ib_cq_recv->tasklet); + tasklet_kill(&smcibdev->ib_cq_send->tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From da70c7e1f35a0dcc71dcc5171f9af97061bd494e Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 13 Jan 2022 17:34:53 +0800 Subject: [PATCH 146/148] net/smc: Multiple CQs per IB devices This allows multiple CQs for one IB device, compared to one CQ now. During IB device setup, it would initialize ibdev->num_comp_vectors amount of send/recv CQs, and the corresponding tasklets, like queues for net devices. Every smc_link has their own send and recv CQs, which always assigning from the least used CQs of current IB device. 
Signed-off-by: Tony Lu --- net/smc/smc_ib.c | 139 +++++++++++++++++++++++++++++++---------------- net/smc/smc_ib.h | 6 +- net/smc/smc_wr.c | 18 ++++-- 3 files changed, 111 insertions(+), 52 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index cc16377fafa7..d33acd85f4c6 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -630,6 +630,36 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, + bool is_send) +{ + struct smc_ib_cq *smcibcq, *cq; + int min, i; + + if (is_send) + smcibcq = smcibdev->smcibcq_send; + else + smcibcq = smcibdev->smcibcq_recv; + + cq = smcibcq; + min = cq->load; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + if (smcibcq[i].load < min) { + cq = &smcibcq[i]; + min = cq->load; + } + } + + cq->load++; + return cq; +} + +static void smc_ib_put_cq(struct smc_ib_cq *smcibcq) +{ + smcibcq->load--; +} + static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -653,8 +683,11 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - if (lnk->roce_qp) + if (lnk->roce_qp) { ib_destroy_qp(lnk->roce_qp); + smc_ib_put_cq(lnk->smcibcq_send); + smc_ib_put_cq(lnk->smcibcq_recv); + } lnk->roce_qp = NULL; lnk->smcibcq_send = NULL; lnk->smcibcq_recv = NULL; @@ -663,12 +696,16 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { + struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, + true); + struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, + false); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, - .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, + .send_cq = smcibcq_send->ib_cq, + .recv_cq = smcibcq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -697,8 +734,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; } else { - lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; - lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; + lnk->smcibcq_send = smcibcq_send; + lnk->smcibcq_recv = smcibcq_recv; smc_wr_remember_qp_attr(lnk); } return rc; @@ -819,20 +856,26 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { - ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); - kfree(smcibdev->ib_cq_send); - smcibdev->ib_cq_send = NULL; + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + if (smcibdev->smcibcq_send[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); + + if (smcibdev->smcibcq_recv[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); + } - ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); - kfree(smcibdev->ib_cq_recv); - smcibdev->ib_cq_recv = NULL; + kfree(smcibdev->smcibcq_send); + kfree(smcibdev->smcibcq_recv); } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; - struct smc_ib_cq *smcibcq_send, *smcibcq_recv; int cqe_size_order, smc_order; + struct smc_ib_cq *smcibcq; + int i, num_cq_peer; long rc; mutex_lock(&smcibdev->mutex); @@ -844,49 +887,53 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); - if (!smcibcq_send) { + num_cq_peer = min_t(int, 
smcibdev->ibdev->num_comp_vectors, + num_online_cpus()); + smcibdev->num_cq_peer = num_cq_peer; + smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), + GFP_KERNEL); + if (!smcibdev->smcibcq_send) { rc = -ENOMEM; - goto out; - } - smcibcq_send->smcibdev = smcibdev; - smcibcq_send->is_send = 1; - cqattr.comp_vector = 0; - smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq_send, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); - if (IS_ERR(smcibdev->ib_cq_send)) { - smcibdev->ib_cq_send = NULL; - goto out; + goto err; } - smcibdev->ib_cq_send = smcibcq_send; - - smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); - if (!smcibcq_recv) { + smcibdev->smcibcq_recv = kcalloc(num_cq_peer, sizeof(*smcibcq), + GFP_KERNEL); + if (!smcibdev->smcibcq_recv) { rc = -ENOMEM; - goto err_send; + goto err; } - smcibcq_recv->smcibdev = smcibdev; - cqattr.comp_vector = 1; - smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibcq_recv, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); - if (IS_ERR(smcibdev->ib_cq_recv)) { - smcibdev->ib_cq_recv = NULL; - goto err_recv; + + /* initialize CQs */ + for (i = 0; i < num_cq_peer; i++) { + /* initialize send CQ */ + smcibcq = &smcibdev->smcibcq_send[i]; + smcibcq->smcibdev = smcibdev; + smcibcq->is_send = 1; + cqattr.comp_vector = i; + smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibcq, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); + if (IS_ERR(smcibcq->ib_cq)) + goto err; + + /* initialize recv CQ */ + smcibcq = &smcibdev->smcibcq_recv[i]; + smcibcq->smcibdev = smcibdev; + cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ + smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibcq, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); + if (IS_ERR(smcibcq->ib_cq)) + goto err; } - smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); 
smcibdev->initialized = 1; goto out; -err_recv: - kfree(smcibcq_recv); - ib_destroy_cq(smcibcq_send->ib_cq); -err_send: - kfree(smcibcq_send); +err: + smc_ib_cleanup_cq(smcibdev); out: mutex_unlock(&smcibdev->mutex); return rc; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 9b24033e20e4..1af83b5a2e7e 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -37,6 +37,7 @@ struct smc_ib_cq { /* ib_cq wrapper for smc */ struct ib_cq *ib_cq; /* real ib_cq for link */ struct tasklet_struct tasklet; /* tasklet for wr */ bool is_send; /* send for recv cq */ + int load; /* load of current cq */ }; struct smc_ib_device { /* ib-device infos for smc */ @@ -44,8 +45,9 @@ struct smc_ib_device { /* ib-device infos for smc */ struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - struct smc_ib_cq *ib_cq_send; /* send completion queue */ - struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ + int num_cq_peer; /* num of snd/rcv cq peer */ + struct smc_ib_cq *smcibcq_send; /* send cqs */ + struct smc_ib_cq *smcibcq_recv; /* recv cqs */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 327dd8ee3590..5c2d30417346 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -842,14 +842,24 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->ib_cq_recv->tasklet); - tasklet_kill(&smcibdev->ib_cq_send->tasklet); + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); + tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); + } } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); + int i; + + 
for (i = 0; i < smcibdev->num_cq_peer; i++) { + tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, + smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, + smc_wr_rx_tasklet_fn); + } } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 56f16439b47058b4d2f089214193996cb26b77b0 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Mon, 14 Feb 2022 16:31:21 +0800 Subject: [PATCH 147/148] net/smc: Keep first contact clcsock This introduces a work around for eRDMA. eRDMA reuse the first TCP tuple to create QP, and don't want to release it. But SMC will release this tuple when first contact connection is shutdown. This patch keeps the first contact connection, and delay the shutdown work to link (QP) release progress. Signed-off-by: Tony Lu --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 5 ++++- net/smc/smc.h | 1 + net/smc/smc_close.c | 12 +++++++++++- net/smc/smc_core.c | 13 +++++++++++++ net/smc/smc_core.h | 28 ++++++++++++++++++++++++++++ net/smc/smc_llc.c | 3 +++ net/smc/smc_sysctl.c | 9 +++++++++ 8 files changed, 70 insertions(+), 2 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index f961104d9f90..c6609ca1b104 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -29,5 +29,6 @@ struct netns_smc { int sysctl_rmem_default; int sysctl_tcp2smc; int sysctl_allow_different_subnet; + int sysctl_keep_first_contact_clcsock; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index af51e519d1b2..fd8f5886c422 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -361,6 +361,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_sndbuf = net->smc.sysctl_wmem_default; sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); + smc->first_contact_local = 0; INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); @@ -2706,7 +2707,7 @@ static int smc_shutdown(struct 
socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (do_shutdown && smc->clcsock) + if (do_shutdown && smc->clcsock && !smc->first_contact_local) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; @@ -3205,6 +3206,7 @@ static __net_init int smc_net_init(struct net *net) init_net.smc.sysctl_rmem_default; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 0; + net->smc.sysctl_keep_first_contact_clcsock = 1; } return smc_pnet_net_init(net); @@ -3351,6 +3353,7 @@ static int __init smc_init(void) init_net.smc.sysctl_rmem_default = 384 * 1024; init_net.smc.sysctl_tcp2smc = 0; init_net.smc.sysctl_allow_different_subnet = 0; + init_net.smc.sysctl_keep_first_contact_clcsock = 1; rc = smc_sysctl_init(); if (rc) { diff --git a/net/smc/smc.h b/net/smc/smc.h index 9ee5eeb600e4..040c6a592c6b 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -253,6 +253,7 @@ struct smc_sock { /* smc sock container */ /* original error_report fct. 
*/ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ + bool first_contact_local; struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 292e4d904ab6..34df4b4b64a2 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -23,16 +23,25 @@ /* release the clcsock that is assigned to the smc_sock */ void smc_clcsock_release(struct smc_sock *smc) { + struct smc_link *lnk; struct socket *tcp; if (smc->listen_smc && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); + /* don't release clcsock for eRDMA */ if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; + lnk = smc->conn.lnk; + if (!smc->use_fallback && smc->first_contact_local && + lnk) { + smc_clcsock_put(lnk->clcsock); + goto out; + } sock_release(tcp); } +out: mutex_unlock(&smc->clcsock_release_lock); } @@ -233,7 +242,8 @@ int smc_close_active(struct smc_sock *smc) /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. */ - if (smc->clcsock && smc->clcsock->sk) { + if (smc->clcsock && smc->clcsock->sk && + !smc->first_contact_local) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? 
rc : rc1; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9bf65589ded7..96abaf4ea122 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -768,6 +768,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ + smc_clcsock_hold(lnk->clcsock); lnk->link_idx = link_idx; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); @@ -822,6 +823,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); + smc_clcsock_put(lnk->clcsock); smc_lgr_put(lgr); /* lgr_hold above */ return rc; } @@ -914,6 +916,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; smcr_link_iw_extension(&lnk->iw_conn_param, smc->clcsock->sk); + lnk->clcsock = kzalloc(sizeof(*lnk->clcsock), GFP_KERNEL); + if (!lnk->clcsock) { + rc = -ENOMEM; + goto free_wq; + } + lnk->clcsock->sock = smc->clcsock; + refcount_set(&lnk->clcsock->refcnt, 1); rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { @@ -1253,6 +1262,7 @@ static void __smcr_link_clear(struct smc_link *lnk) smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; + smc_clcsock_put(lnk->clcsock); if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ @@ -1925,6 +1935,9 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) create: if (ini->first_contact_local) { + /* keep this clcsock for QP reuse */ + if (net->smc.sysctl_keep_first_contact_clcsock) + smc->first_contact_local = 1; rc = smc_lgr_create(smc, ini); if (rc) goto out; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 35951baf55f9..a695d5bcab3a 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ 
-87,6 +87,32 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LINKFLAG_ANNOUNCE_PENDING 0 +struct smc_clcsock { + refcount_t refcnt; + struct socket *sock; +}; + +static inline void smc_clcsock_hold(struct smc_clcsock *clcsock) +{ + if (!clcsock) + return; + + refcount_inc(&clcsock->refcnt); +} + +static inline void smc_clcsock_put(struct smc_clcsock *clcsock) +{ + if (!clcsock) + return; + + if (refcount_dec_and_test(&clcsock->refcnt)) { + if (clcsock->sock) + sock_release(clcsock->sock); + clcsock->sock = NULL; + kfree(clcsock); + } +} + struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -167,6 +193,8 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ + + struct smc_clcsock *clcsock; /* keep for eRDMA */ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 67b8b1595770..9a5b2880e761 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1114,6 +1114,8 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_reject; lnk_new = &lgr->lnk[lnk_idx]; lnk_new->iw_conn_param = link->iw_conn_param; + lnk_new->clcsock = link->clcsock; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); if (rc) goto out_reject; @@ -1485,6 +1487,7 @@ int smc_llc_srv_add_link(struct smc_link *link, } lgr->lnk[lnk_idx].iw_conn_param = link->iw_conn_param; + lgr->lnk[lnk_idx].clcsock = link->clcsock; rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); if (rc) goto out; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 6a2eea8fc4a4..c2ff96ffb35a 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -70,6 +70,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = 
"keep_first_contact_clcsock", + .data = &init_net.smc.sysctl_keep_first_contact_clcsock, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; -- Gitee From f9c10d623c28f97e91b1fe40c88cd7e4c59f1b5a Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 21 Mar 2022 21:10:41 +0800 Subject: [PATCH 148/148] net/smc: Avoid holding lock too much --- net/smc/af_smc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fd8f5886c422..bff2e05b33a2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1057,9 +1057,13 @@ static int smc_connect_clc(struct smc_sock *smc, rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; + + release_sock(&smc->sk); /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + rc = smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, SMC_CLC_ACCEPT, CLC_WAIT_TIME); + lock_sock(&smc->sk); + return rc; } void smc_fill_gid_list(struct smc_link_group *lgr, -- Gitee