From b00d544f0146f707d8f5cee008e538905b6c0017 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:27 +0800 Subject: [PATCH 01/95] anolis: Revert "anolis: net/smc: Fix slab-out-of-bounds issue in fallback" ANBZ: #1742 This reverts commit d276af6a49875d85d547ed91ee7f75813e5617c8. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 80 +++++++++++++-------------------------------- net/smc/smc_close.c | 2 -- 2 files changed, 23 insertions(+), 59 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b59fe3958a27..d099181c65e2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -248,27 +248,11 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); -static void smc_fback_restore_callbacks(struct smc_sock *smc) -{ - struct sock *clcsk = smc->clcsock->sk; - - write_lock_bh(&clcsk->sk_callback_lock); - clcsk->sk_user_data = NULL; - - smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); - smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); - smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); - smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); - - write_unlock_bh(&clcsk->sk_callback_lock); -} - static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; - smc_fback_restore_callbacks(smc); } } @@ -779,57 +763,48 @@ static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, static void smc_fback_state_change(struct sock *clcsk) { - struct smc_sock *smc; + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); - read_lock_bh(&clcsk->sk_callback_lock); - smc = smc_clcsock_user_data(clcsk); - if (smc) - smc_fback_forward_wakeup(smc, clcsk, - smc->clcsk_state_change); - read_unlock_bh(&clcsk->sk_callback_lock); + if (!smc) + return; + 
smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); } static void smc_fback_data_ready(struct sock *clcsk) { - struct smc_sock *smc; + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); - read_lock_bh(&clcsk->sk_callback_lock); - smc = smc_clcsock_user_data(clcsk); - if (smc) - smc_fback_forward_wakeup(smc, clcsk, - smc->clcsk_data_ready); - read_unlock_bh(&clcsk->sk_callback_lock); + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); } static void smc_fback_write_space(struct sock *clcsk) { - struct smc_sock *smc; + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); - read_lock_bh(&clcsk->sk_callback_lock); - smc = smc_clcsock_user_data(clcsk); - if (smc) - smc_fback_forward_wakeup(smc, clcsk, - smc->clcsk_write_space); - read_unlock_bh(&clcsk->sk_callback_lock); + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); } static void smc_fback_error_report(struct sock *clcsk) { - struct smc_sock *smc; + struct smc_sock *smc = + smc_clcsock_user_data(clcsk); - read_lock_bh(&clcsk->sk_callback_lock); - smc = smc_clcsock_user_data(clcsk); - if (smc) - smc_fback_forward_wakeup(smc, clcsk, - smc->clcsk_error_report); - read_unlock_bh(&clcsk->sk_callback_lock); + if (!smc) + return; + smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); } static void smc_fback_replace_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; - write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, @@ -840,8 +815,6 @@ static void smc_fback_replace_callbacks(struct smc_sock *smc) &smc->clcsk_write_space); smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, &smc->clcsk_error_report); - - write_unlock_bh(&clcsk->sk_callback_lock); } static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) @@ -2426,20 +2399,17 @@ static void 
smc_tcp_listen_work(struct work_struct *work) static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc; + struct smc_sock *lsmc = + smc_clcsock_user_data(listen_clcsock); - read_lock_bh(&listen_clcsock->sk_callback_lock); - lsmc = smc_clcsock_user_data(listen_clcsock); if (!lsmc) - goto out; + return; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } -out: - read_unlock_bh(&listen_clcsock->sk_callback_lock); } static int smc_listen(struct socket *sock, int backlog) @@ -2471,12 +2441,10 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ - write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, smc_clcsock_data_ready, &smc->clcsk_data_ready); - write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; @@ -2491,11 +2459,9 @@ static int smc_listen(struct socket *sock, int backlog) rc = kernel_listen(smc->clcsock, backlog); if (rc) { - write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; - write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 038bcafe9a9e..c83d9ee82c08 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -216,11 +216,9 @@ int smc_close_active(struct smc_sock *smc) sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { - 
write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; - write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } smc_close_cleanup_listen(sk); -- Gitee From ed7b804cd870892b7eec2abb7ba7ac71f148acdf Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:28 +0800 Subject: [PATCH 02/95] anolis: Revert "anolis: net/smc: Only save the original clcsock callback functions" ANBZ: #1742 This reverts commit 676c57893d060a0d78353c4be966096a6480550a. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 55 ++++++++++++++++----------------------------- net/smc/smc.h | 29 ------------------------ net/smc/smc_close.c | 3 +-- 3 files changed, 20 insertions(+), 67 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d099181c65e2..4056b99aaf77 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -381,7 +381,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); - smc_init_saved_callbacks(smc); return sk; } @@ -801,24 +800,9 @@ static void smc_fback_error_report(struct sock *clcsk) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); } -static void smc_fback_replace_callbacks(struct smc_sock *smc) -{ - struct sock *clcsk = smc->clcsock->sk; - - clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); - - smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, - &smc->clcsk_state_change); - smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, - &smc->clcsk_data_ready); - smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, - &smc->clcsk_write_space); - smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, - &smc->clcsk_error_report); -} - static 
int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { + struct sock *clcsk; int rc = 0; mutex_lock(&smc->clcsock_release_lock); @@ -826,7 +810,10 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) rc = -EBADF; goto out; } + clcsk = smc->clcsock->sk; + if (smc->use_fallback) + goto out; smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -841,7 +828,18 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * in smc sk->sk_wq and they should be woken up * as clcsock's wait queue is woken up. */ - smc_fback_replace_callbacks(smc); + smc->clcsk_state_change = clcsk->sk_state_change; + smc->clcsk_data_ready = clcsk->sk_data_ready; + smc->clcsk_write_space = clcsk->sk_write_space; + smc->clcsk_error_report = clcsk->sk_error_report; + + clcsk->sk_state_change = smc_fback_state_change; + clcsk->sk_data_ready = smc_fback_data_ready; + clcsk->sk_write_space = smc_fback_write_space; + clcsk->sk_error_report = smc_fback_error_report; + + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } out: mutex_unlock(&smc->clcsock_release_lock); @@ -1621,19 +1619,6 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; - - /* if new clcsock has also inherited the fallback-specific callback - * functions, switch them back to the original ones. 
- */ - if (lsmc->use_fallback) { - if (lsmc->clcsk_state_change) - new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; - if (lsmc->clcsk_write_space) - new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; - if (lsmc->clcsk_error_report) - new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; - } - (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -2441,10 +2426,10 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ + smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; + smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); - smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, - smc_clcsock_data_ready, &smc->clcsk_data_ready); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; @@ -2459,9 +2444,7 @@ static int smc_listen(struct socket *sock, int backlog) rc = kernel_listen(smc->clcsock, backlog); if (rc) { - smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, - &smc->clcsk_data_ready); - smc->clcsock->sk->sk_user_data = NULL; + smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; goto out; } sk->sk_max_ack_backlog = backlog; diff --git a/net/smc/smc.h b/net/smc/smc.h index 05864aeb7909..d68e6605d548 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -291,41 +291,12 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } -static inline void smc_init_saved_callbacks(struct smc_sock *smc) -{ - smc->clcsk_state_change = NULL; - smc->clcsk_data_ready = NULL; - smc->clcsk_write_space = NULL; - smc->clcsk_error_report = NULL; -} - static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); } -/* save target_cb in saved_cb, and replace target_cb with new_cb */ -static inline 
void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), - void (*new_cb)(struct sock *), - void (**saved_cb)(struct sock *)) -{ - /* only save once */ - if (!*saved_cb) - *saved_cb = *target_cb; - *target_cb = new_cb; -} - -/* restore target_cb to saved_cb, and reset saved_cb to NULL */ -static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), - void (**saved_cb)(struct sock *)) -{ - if (!*saved_cb) - return; - *target_cb = *saved_cb; - *saved_cb = NULL; -} - extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index c83d9ee82c08..94c817da7a31 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -216,8 +216,7 @@ int smc_close_active(struct smc_sock *smc) sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { - smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, - &smc->clcsk_data_ready); + smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; smc->clcsock->sk->sk_user_data = NULL; rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } -- Gitee From ae2afeb9c0c2fef3d13c13e55b103cc648127198 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:29 +0800 Subject: [PATCH 03/95] anolis: Revert "anolis: net/smc: sync err info when TCP connection is refused" ANBZ: #1742 This reverts commit c8451002f6d6f65e52479fe9222556e4156c6bb1. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4056b99aaf77..a46eca40a997 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1498,8 +1498,6 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_state = SMC_CLOSED; if (rc == -EPIPE || rc == -EAGAIN) smc->sk.sk_err = EPIPE; - else if (rc == -ECONNREFUSED) - smc->sk.sk_err = ECONNREFUSED; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); sock_put(&smc->sk); /* passive closing */ -- Gitee From 91c285adadeb33fbd687a786479e3e3b7b52430c Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:30 +0800 Subject: [PATCH 04/95] anolis: Revert "anolis: net/smc: don't req_notify until all CQEs drained" ANBZ: #1742 This reverts commit abdd5b3968edfc85f74d8374d716e76243577e2e. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 49 +++++++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index c36b7c3e1b4c..bd5a55acce5e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -148,28 +148,25 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i, rc; + int i = 0, rc; + int polled = 0; again: + polled++; do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(smcibcq->ib_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; for (i = 0; i < rc; i++) smc_wr_tx_process_cqe(&wc[i]); - if (rc < SMC_WR_MAX_POLL_CQE) - /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been - * drained, no need to poll again. 
- */ - break; } while (rc > 0); - - /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, - * then it is safe to wait for the next event; else we must poll the - * CQ again to make sure we won't miss any event. - */ - if (ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0) + if (polled == 1) goto again; } @@ -507,28 +504,24 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int polled = 0; int rc; again: + polled++; do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - if (rc > 0) - smc_wr_rx_process_cqes(&wc[0], rc); - if (rc < SMC_WR_MAX_POLL_CQE) - /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been - * drained, no need to poll again. - */ + if (polled == 1) { + ib_req_notify_cq(smcibcq->ib_cq, + IB_CQ_SOLICITED_MASK + | IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) break; + smc_wr_rx_process_cqes(&wc[0], rc); } while (rc > 0); - - /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, - * then it is safe to wait for the next event; else we must poll the - * CQ again to make sure we won't miss any event. - */ - if (ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0) + if (polled == 1) goto again; } -- Gitee From e7c55c223ed042f7b9c383c63affe9c1051c9dd3 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:31 +0800 Subject: [PATCH 05/95] anolis: Revert "anolis: net/smc: Introduce link dimension req & comp debug info" ANBZ: #1742 This reverts commit 9add61586142a2a10e03ef7a2eeaa3469b862861. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_cdc.c | 2 -- net/smc/smc_core.c | 12 ------------ net/smc/smc_core.h | 10 ---------- net/smc/smc_ib.c | 10 ++-------- net/smc/smc_llc.c | 2 -- net/smc/smc_proc.c | 20 ++++---------------- net/smc/smc_tx.c | 2 -- net/smc/smc_wr.c | 24 ++++-------------------- 8 files changed, 10 insertions(+), 72 deletions(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index c469a0c67c3c..84eed367699e 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -34,7 +34,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { - atomic_inc(&link->cdc_comp_cnt); diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); @@ -132,7 +131,6 @@ int smc_cdc_msg_send(struct smc_connection *conn, if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; - atomic_inc(&link->cdc_send_cnt); } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9ccf9a432c3c..453062e55b36 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -772,18 +772,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); - - atomic_set(&lnk->total_send_cnt, 0); - atomic_set(&lnk->total_comp_cnt, 0); - atomic_set(&lnk->reg_send_cnt, 0); - atomic_set(&lnk->reg_comp_cnt, 0); - atomic_set(&lnk->cdc_send_cnt, 0); - atomic_set(&lnk->cdc_comp_cnt, 0); - atomic_set(&lnk->llc_send_cnt, 0); - atomic_set(&lnk->llc_comp_cnt, 0); - atomic_set(&lnk->rdma_write_cnt, 0); - atomic_set(&lnk->bad_comp_cnt, 0); - smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!lnk->smcibdev->initialized) { diff --git 
a/net/smc/smc_core.h b/net/smc/smc_core.h index 3d8954ca0af1..6aaab66cb550 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -169,16 +169,6 @@ struct smc_link { atomic_t conn_cnt; /* connections on this link */ struct socket *clcsock; /* keep for eRDMA */ - atomic_t total_send_cnt; - atomic_t total_comp_cnt; - atomic_t cdc_send_cnt; - atomic_t cdc_comp_cnt; - atomic_t llc_send_cnt; - atomic_t llc_comp_cnt; - atomic_t reg_send_cnt; - atomic_t reg_comp_cnt; - atomic_t rdma_write_cnt; - atomic_t bad_comp_cnt; }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index c98e871b54c4..d33acd85f4c6 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -870,12 +870,6 @@ static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) kfree(smcibdev->smcibcq_recv); } -static void cq_event_handler(struct ib_event *event, void *data) -{ - pr_warn("smc: event %u (%s) data %p\n", - event->event, ib_event_msg(event->event), data); -} - long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; @@ -917,7 +911,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smcibcq->is_send = 1; cqattr.comp_vector = i; smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, cq_event_handler, + smc_wr_tx_cq_handler, NULL, smcibcq, &cqattr); rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); if (IS_ERR(smcibcq->ib_cq)) @@ -928,7 +922,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smcibcq->smcibdev = smcibdev; cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, cq_event_handler, + smc_wr_rx_cq_handler, NULL, smcibcq, &cqattr); rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); if (IS_ERR(smcibcq->ib_cq)) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index d323b81f6d04..5351abff3d71 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ 
-389,8 +389,6 @@ static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend, enum ib_wc_status wc_status) { /* future work: handle wc_status error for recovery and failover */ - if (!wc_status) - atomic_inc(&link->llc_comp_cnt); } /** diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c index d9c11b31c4ab..106887b7b9e1 100644 --- a/net/smc/smc_proc.c +++ b/net/smc/smc_proc.c @@ -243,11 +243,9 @@ static int proc_show_links(struct seq_file *seq, void *v) struct smc_link *lnk; int i = 0, j = 0; - seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s\n", + seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s\n", "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", - "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags", "total_send", - "total_comp", "cdc_send", "cdc_comp", "llc_send", "llc_comp", "reg_send", - "reg_comp", "bad_comp", "rdma_write"); + "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags"); spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { @@ -257,24 +255,14 @@ static int proc_show_links(struct seq_file *seq, void *v) continue; for (j = 0; j < SMC_LGR_ID_SIZE; j++) seq_printf(seq, "%02X", lgr->id[j]); - seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u\n", + seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu\n", lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, atomic_read(&lnk->local_rq_credits), atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, - lnk->peer_cr_watermark_low, lnk->flags, - atomic_read(&lnk->total_send_cnt), - atomic_read(&lnk->total_comp_cnt), - atomic_read(&lnk->cdc_send_cnt), - atomic_read(&lnk->cdc_comp_cnt), - atomic_read(&lnk->llc_send_cnt), - atomic_read(&lnk->llc_comp_cnt), - atomic_read(&lnk->reg_send_cnt), - atomic_read(&lnk->reg_comp_cnt), - atomic_read(&lnk->bad_comp_cnt), - atomic_read(&lnk->rdma_write_cnt)); + lnk->peer_cr_watermark_low, lnk->flags); } } spin_unlock_bh(&smc_lgr_list.lock); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 20217edfb9e3..6b65d56ced19 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -367,8 +367,6 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); - else - atomic_inc(&link->rdma_write_cnt); return rc; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index bd5a55acce5e..5c2d30417346 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -81,17 +81,12 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) u32 pnd_snd_idx; link = wc->qp->qp_context; - atomic_inc(&link->total_comp_cnt); if (wc->opcode == IB_WC_REG_MR) { - if (wc->status) { + if (wc->status) link->wr_reg_state = FAILED; - pr_warn("smc: reg mr comp failed\n"); - atomic_inc(&link->bad_comp_cnt); - } else { + else link->wr_reg_state = CONFIRMED; - atomic_inc(&link->reg_comp_cnt); - } smc_wr_wakeup_reg_wait(link); return; } @@ -99,10 +94,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); if (pnd_snd_idx == link->wr_tx_cnt) { if (link->lgr->smc_version != SMC_V2 || - link->wr_tx_v2_pend->wr_id != wc->wr_id) { - pr_warn("smc: find pending index failed\n"); + 
link->wr_tx_v2_pend->wr_id != wc->wr_id) return; - } link->wr_tx_v2_pend->wc_status = wc->status; memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); /* clear the full struct smc_wr_tx_pend including .priv */ @@ -121,14 +114,11 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_pends[pnd_snd_idx])); memset(&link->wr_tx_bufs[pnd_snd_idx], 0, sizeof(link->wr_tx_bufs[pnd_snd_idx])); - if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) { - pr_warn("smc: clear pending index bitmap failed\n"); + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) return; - } } if (wc->status) { - atomic_inc(&link->bad_comp_cnt); if (link->lgr->smc_version == SMC_V2) { memset(link->wr_tx_v2_pend, 0, sizeof(*link->wr_tx_v2_pend)); @@ -327,8 +317,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) if (rc) { smc_wr_tx_put_slot(link, priv); smcr_link_down_cond_sched(link); - } else { - atomic_inc(&link->total_send_cnt); } return rc; } @@ -345,8 +333,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, if (rc) { smc_wr_tx_put_slot(link, priv); smcr_link_down_cond_sched(link); - } else { - atomic_inc(&link->total_send_cnt); } return rc; } @@ -394,8 +380,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL); if (rc) return rc; - atomic_inc(&link->reg_send_cnt); - atomic_inc(&link->total_send_cnt); atomic_inc(&link->wr_reg_refcnt); rc = wait_event_interruptible_timeout(link->wr_reg_wait, -- Gitee From 9a24387b77067d58758e869a4dcf869d5c26c8e3 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:31 +0800 Subject: [PATCH 06/95] anolis: Revert "anolis: net/smc: Introduce rtoken validity check before sending" ANBZ: #1742 This reverts commit 0c68d402dad434b208d4f4b2abfc45f729e2f972. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_tx.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 6b65d56ced19..f305d0033f4a 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -358,12 +358,6 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; - /* rtoken might be deleted if peer freed connection */ - if (!rdma_wr->rkey || - (rdma_wr->remote_addr == (conn->tx_off + peer_rmbe_offset))) { - pr_warn_ratelimited("smc: unexpected sends during connection termination flow\n"); - return -EINVAL; - } rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); -- Gitee From 20eb79e52278d14c9c0a2a1a7d9503d7b79297a7 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:32 +0800 Subject: [PATCH 07/95] anolis: Revert "anolis: net/smc: Introduce a sysctl to disable {a}symmetric link group" ANBZ: #1742 This reverts commit c57a65812f9bd9e153b897bef4fcad0e80d39370. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 - net/smc/af_smc.c | 36 +++++++++++++++--------------------- net/smc/smc_core.c | 3 --- net/smc/smc_llc.c | 9 --------- net/smc/smc_sysctl.c | 10 ---------- 5 files changed, 15 insertions(+), 44 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 135cfa9f42c4..9c0c06c0341c 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -30,6 +30,5 @@ struct netns_smc { int sysctl_tcp2smc; int sysctl_allow_different_subnet; int sysctl_keep_first_contact_clcsock; - int sysctl_disable_multiple_link; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a46eca40a997..ecdc707fdc03 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -516,7 +516,6 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -556,22 +555,20 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - if (!net->smc.sysctl_disable_multiple_link) { - /* optional 2nd link, receive ADD LINK request from server */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, - SMC_LLC_ADD_LINK); - if (!qentry) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - if (rc == -EAGAIN) - rc = 0; /* no DECLINE received, go with one link */ - return rc; - } - smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - smc_llc_cli_add_link(link, qentry); + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, 
CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); return 0; } @@ -1701,7 +1698,6 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -1735,10 +1731,8 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - if (!net->smc.sysctl_disable_multiple_link) { - /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link, NULL); - } + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 453062e55b36..a163b80aac24 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1661,9 +1661,6 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) lgr->type == SMC_LGR_ASYMMETRIC_PEER || !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; - if (lgr->type == SMC_LGR_SINGLE && - lgr->net->smc.sysctl_disable_multiple_link) - continue; /* trigger local add link processing */ link = smc_llc_usable_link(lgr); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5351abff3d71..9a5b2880e761 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1085,9 +1085,6 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) rc = -ENOMEM; goto out_reject; } - if (lgr->type == SMC_LGR_SINGLE && - lgr->net->smc.sysctl_disable_multiple_link) - goto out_reject; ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { @@ -1215,9 +1212,6 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, if (lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; - if (lgr->type == 
SMC_LGR_SINGLE && - lgr->net->smc.sysctl_disable_multiple_link) - goto out; ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) @@ -1463,9 +1457,6 @@ int smc_llc_srv_add_link(struct smc_link *link, rc = -ENOMEM; goto out; } - if (lgr->type == SMC_LGR_SINGLE && - lgr->net->smc.sysctl_disable_multiple_link) - goto out; /* ignore client add link recommendation, start new flow */ ini->vlan_id = lgr->vlan_id; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 78f9895d649e..fb18775324ea 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -79,15 +79,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { - .procname = "disable_multiple_link", - .data = &init_net.smc.sysctl_disable_multiple_link, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; @@ -117,7 +108,6 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 1; net->smc.sysctl_keep_first_contact_clcsock = 1; - net->smc.sysctl_disable_multiple_link = 1; return 0; -- Gitee From cef9f7cce8289e9a039e867f269fa23d3c821603 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:33 +0800 Subject: [PATCH 08/95] anolis: Revert "anolis: net/smc: Keep first contact clcsock" ANBZ: #1742 This reverts commit 198a7050bff7ab6e004fe2f459ed164eb7bbac47. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 - net/smc/af_smc.c | 3 +-- net/smc/smc.h | 1 - net/smc/smc_close.c | 7 ++----- net/smc/smc_core.c | 6 ------ net/smc/smc_core.h | 2 -- net/smc/smc_llc.c | 3 --- net/smc/smc_sysctl.c | 10 ---------- 8 files changed, 3 insertions(+), 30 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 9c0c06c0341c..8b7de3d00625 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -29,6 +29,5 @@ struct netns_smc { int sysctl_rmem_default; int sysctl_tcp2smc; int sysctl_allow_different_subnet; - int sysctl_keep_first_contact_clcsock; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ecdc707fdc03..f05a61d77fc2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -371,7 +371,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_sndbuf = net->smc.sysctl_wmem_default; sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); - smc->keep_clcsock = 0; INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); @@ -2719,7 +2718,7 @@ static int smc_shutdown(struct socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (do_shutdown && smc->clcsock && !smc->keep_clcsock) + if (do_shutdown && smc->clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; diff --git a/net/smc/smc.h b/net/smc/smc.h index d68e6605d548..9ee5eeb600e4 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -253,7 +253,6 @@ struct smc_sock { /* smc sock container */ /* original error_report fct. 
*/ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ - bool keep_clcsock; struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 94c817da7a31..676cb2333d3c 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -28,12 +28,10 @@ void smc_clcsock_release(struct smc_sock *smc) if (smc->listen_smc && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); - /* don't release clcsock for eRDMA */ if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; - if (!smc->keep_clcsock) - sock_release(tcp); + sock_release(tcp); } mutex_unlock(&smc->clcsock_release_lock); } @@ -238,8 +236,7 @@ int smc_close_active(struct smc_sock *smc) /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. */ - if (smc->clcsock && smc->clcsock->sk && - !smc->keep_clcsock) { + if (smc->clcsock && smc->clcsock->sk) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? 
rc : rc1; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a163b80aac24..9bf65589ded7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -914,7 +914,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; smcr_link_iw_extension(&lnk->iw_conn_param, smc->clcsock->sk); - lnk->clcsock = smc->clcsock; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { @@ -1250,8 +1249,6 @@ static void __smcr_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); - if (lnk->clcsock) - sock_release(lnk->clcsock); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); @@ -1928,9 +1925,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) create: if (ini->first_contact_local) { - /* keep this clcsock for QP reuse */ - if (net->smc.sysctl_keep_first_contact_clcsock) - smc->keep_clcsock = 1; rc = smc_lgr_create(smc, ini); if (rc) goto out; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6aaab66cb550..35951baf55f9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -167,8 +167,6 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ - - struct socket *clcsock; /* keep for eRDMA */ }; /* For now we just allow one parallel link per link group. 
The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9a5b2880e761..67b8b1595770 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1114,8 +1114,6 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_reject; lnk_new = &lgr->lnk[lnk_idx]; lnk_new->iw_conn_param = link->iw_conn_param; - lnk_new->clcsock = link->clcsock; - rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); if (rc) goto out_reject; @@ -1487,7 +1485,6 @@ int smc_llc_srv_add_link(struct smc_link *link, } lgr->lnk[lnk_idx].iw_conn_param = link->iw_conn_param; - lgr->lnk[lnk_idx].clcsock = link->clcsock; rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); if (rc) goto out; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index fb18775324ea..676c2848d82d 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -70,15 +70,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { - .procname = "keep_first_contact_clcsock", - .data = &init_net.smc.sysctl_keep_first_contact_clcsock, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; @@ -107,7 +98,6 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_rmem_default = 384 * 1024; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 1; - net->smc.sysctl_keep_first_contact_clcsock = 1; return 0; -- Gitee From f0a3ac0400025e0a937a72dcf9216a25f5745aba Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:34 +0800 Subject: [PATCH 09/95] anolis: Revert "anolis: net/smc: Multiple CQs per IB devices" ANBZ: #1742 This reverts commit dce3bbaa433a3945096d1989d8ebbf1a0ea8d396. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 139 ++++++++++++++++------------------------------- net/smc/smc_ib.h | 6 +- net/smc/smc_wr.c | 18 ++---- 3 files changed, 52 insertions(+), 111 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d33acd85f4c6..cc16377fafa7 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -630,36 +630,6 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, - bool is_send) -{ - struct smc_ib_cq *smcibcq, *cq; - int min, i; - - if (is_send) - smcibcq = smcibdev->smcibcq_send; - else - smcibcq = smcibdev->smcibcq_recv; - - cq = smcibcq; - min = cq->load; - - for (i = 0; i < smcibdev->num_cq_peer; i++) { - if (smcibcq[i].load < min) { - cq = &smcibcq[i]; - min = cq->load; - } - } - - cq->load++; - return cq; -} - -static void smc_ib_put_cq(struct smc_ib_cq *smcibcq) -{ - smcibcq->load--; -} - static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -683,11 +653,8 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - if (lnk->roce_qp) { + if (lnk->roce_qp) ib_destroy_qp(lnk->roce_qp); - smc_ib_put_cq(lnk->smcibcq_send); - smc_ib_put_cq(lnk->smcibcq_recv); - } lnk->roce_qp = NULL; lnk->smcibcq_send = NULL; lnk->smcibcq_recv = NULL; @@ -696,16 +663,12 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, - true); - struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, - false); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = smcibcq_send->ib_cq, - .recv_cq = smcibcq_recv->ib_cq, + .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, + .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -734,8 +697,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; } else { - lnk->smcibcq_send = smcibcq_send; - lnk->smcibcq_recv = smcibcq_recv; + lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; + lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; smc_wr_remember_qp_attr(lnk); } return rc; @@ -856,26 +819,20 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq_peer; i++) { - if (smcibdev->smcibcq_send[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); - - if (smcibdev->smcibcq_recv[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); - } + ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); + kfree(smcibdev->ib_cq_send); + smcibdev->ib_cq_send = NULL; - kfree(smcibdev->smcibcq_send); - kfree(smcibdev->smcibcq_recv); + ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); + kfree(smcibdev->ib_cq_recv); + smcibdev->ib_cq_recv = NULL; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; + struct smc_ib_cq *smcibcq_send, *smcibcq_recv; int cqe_size_order, smc_order; - struct smc_ib_cq *smcibcq; - int i, num_cq_peer; long rc; mutex_lock(&smcibdev->mutex); @@ -887,53 +844,49 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - num_cq_peer = min_t(int, smcibdev->ibdev->num_comp_vectors, - num_online_cpus()); - smcibdev->num_cq_peer = num_cq_peer; - 
smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_send) { + smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); + if (!smcibcq_send) { rc = -ENOMEM; - goto err; + goto out; } - smcibdev->smcibcq_recv = kcalloc(num_cq_peer, sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_recv) { - rc = -ENOMEM; - goto err; + smcibcq_send->smcibdev = smcibdev; + smcibcq_send->is_send = 1; + cqattr.comp_vector = 0; + smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibcq_send, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); + if (IS_ERR(smcibdev->ib_cq_send)) { + smcibdev->ib_cq_send = NULL; + goto out; } + smcibdev->ib_cq_send = smcibcq_send; - /* initialize CQs */ - for (i = 0; i < num_cq_peer; i++) { - /* initialize send CQ */ - smcibcq = &smcibdev->smcibcq_send[i]; - smcibcq->smcibdev = smcibdev; - smcibcq->is_send = 1; - cqattr.comp_vector = i; - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) - goto err; - - /* initialize recv CQ */ - smcibcq = &smcibdev->smcibcq_recv[i]; - smcibcq->smcibdev = smcibdev; - cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) - goto err; + smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); + if (!smcibcq_recv) { + rc = -ENOMEM; + goto err_send; + } + smcibcq_recv->smcibdev = smcibdev; + cqattr.comp_vector = 1; + smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibcq_recv, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); + if (IS_ERR(smcibdev->ib_cq_recv)) { + smcibdev->ib_cq_recv = NULL; + goto err_recv; } + smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); 
smcibdev->initialized = 1; goto out; -err: - smc_ib_cleanup_cq(smcibdev); +err_recv: + kfree(smcibcq_recv); + ib_destroy_cq(smcibcq_send->ib_cq); +err_send: + kfree(smcibcq_send); out: mutex_unlock(&smcibdev->mutex); return rc; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 1af83b5a2e7e..9b24033e20e4 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -37,7 +37,6 @@ struct smc_ib_cq { /* ib_cq wrapper for smc */ struct ib_cq *ib_cq; /* real ib_cq for link */ struct tasklet_struct tasklet; /* tasklet for wr */ bool is_send; /* send for recv cq */ - int load; /* load of current cq */ }; struct smc_ib_device { /* ib-device infos for smc */ @@ -45,9 +44,8 @@ struct smc_ib_device { /* ib-device infos for smc */ struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - int num_cq_peer; /* num of snd/rcv cq peer */ - struct smc_ib_cq *smcibcq_send; /* send cqs */ - struct smc_ib_cq *smcibcq_recv; /* recv cqs */ + struct smc_ib_cq *ib_cq_send; /* send completion queue */ + struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 5c2d30417346..327dd8ee3590 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -842,24 +842,14 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); - tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); - } + tasklet_kill(&smcibdev->ib_cq_recv->tasklet); + tasklet_kill(&smcibdev->ib_cq_send->tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, - smc_wr_tx_tasklet_fn); - 
tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, - smc_wr_rx_tasklet_fn); - } + tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 4eedc3f5a8401d45c07aaa42d301147ed841cc2f Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:35 +0800 Subject: [PATCH 10/95] anolis: Revert "anolis: net/smc: Introduce smc_ib_cq to bind link and cq" ANBZ: #1742 This reverts commit 15194046ae30c305d343bf06a1e5ac7e273f296d. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.h | 2 -- net/smc/smc_ib.c | 86 ++++++++++++++-------------------------------- net/smc/smc_ib.h | 13 +++---- net/smc/smc_wr.c | 32 ++++++++--------- 4 files changed, 45 insertions(+), 88 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 35951baf55f9..5849a98c7f6e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -94,8 +94,6 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ - struct smc_ib_cq *smcibcq_recv; /* cq for recv */ - struct smc_ib_cq *smcibcq_send; /* cq for send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index cc16377fafa7..9d55173d474f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,12 +131,12 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, IB_CQ_SOLICITED_MASK); if (rc) goto out; - rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) goto out; @@ -656,8 +656,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) if (lnk->roce_qp) 
ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; - lnk->smcibcq_send = NULL; - lnk->smcibcq_recv = NULL; } /* create a queue pair within the protection domain for a link */ @@ -667,8 +665,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, - .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, + .send_cq = lnk->smcibdev->roce_cq_send, + .recv_cq = lnk->smcibdev->roce_cq_recv, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -694,13 +692,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); - if (IS_ERR(lnk->roce_qp)) { + if (IS_ERR(lnk->roce_qp)) lnk->roce_qp = NULL; - } else { - lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; - lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; + else smc_wr_remember_qp_attr(lnk); - } return rc; } @@ -817,21 +812,10 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } -static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) -{ - ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); - kfree(smcibdev->ib_cq_send); - smcibdev->ib_cq_send = NULL; - - ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); - kfree(smcibdev->ib_cq_recv); - smcibdev->ib_cq_recv = NULL; -} - long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { - struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; - struct smc_ib_cq *smcibcq_send, *smcibcq_recv; + struct ib_cq_init_attr cqattr = { + .cqe = SMC_MAX_CQE, .comp_vector = 0 }; int cqe_size_order, smc_order; long rc; @@ -844,49 +828,28 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); - if 
(!smcibcq_send) { - rc = -ENOMEM; - goto out; - } - smcibcq_send->smcibdev = smcibdev; - smcibcq_send->is_send = 1; - cqattr.comp_vector = 0; - smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq_send, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); - if (IS_ERR(smcibdev->ib_cq_send)) { - smcibdev->ib_cq_send = NULL; + smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); + if (IS_ERR(smcibdev->roce_cq_send)) { + smcibdev->roce_cq_send = NULL; goto out; } - smcibdev->ib_cq_send = smcibcq_send; - - smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); - if (!smcibcq_recv) { - rc = -ENOMEM; - goto err_send; - } - smcibcq_recv->smcibdev = smcibdev; - cqattr.comp_vector = 1; - smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibcq_recv, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); - if (IS_ERR(smcibdev->ib_cq_recv)) { - smcibdev->ib_cq_recv = NULL; - goto err_recv; + smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); + if (IS_ERR(smcibdev->roce_cq_recv)) { + smcibdev->roce_cq_recv = NULL; + goto err; } - smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; goto out; -err_recv: - kfree(smcibcq_recv); - ib_destroy_cq(smcibcq_send->ib_cq); -err_send: - kfree(smcibcq_send); +err: + ib_destroy_cq(smcibdev->roce_cq_send); out: mutex_unlock(&smcibdev->mutex); return rc; @@ -898,7 +861,8 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) goto out; smcibdev->initialized = 0; - smc_ib_cleanup_cq(smcibdev); + ib_destroy_cq(smcibdev->roce_cq_recv); + ib_destroy_cq(smcibdev->roce_cq_send); smc_wr_remove_dev(smcibdev); out: mutex_unlock(&smcibdev->mutex); diff --git a/net/smc/smc_ib.h 
b/net/smc/smc_ib.h index 9b24033e20e4..5d8b49c57f50 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -32,20 +32,15 @@ struct smc_ib_devices { /* list of smc ib devices definition */ extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ -struct smc_ib_cq { /* ib_cq wrapper for smc */ - struct smc_ib_device *smcibdev; /* parent ib device */ - struct ib_cq *ib_cq; /* real ib_cq for link */ - struct tasklet_struct tasklet; /* tasklet for wr */ - bool is_send; /* send for recv cq */ -}; - struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - struct smc_ib_cq *ib_cq_send; /* send completion queue */ - struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ + struct ib_cq *roce_cq_send; /* send completion queue */ + struct ib_cq *roce_cq_recv; /* recv completion queue */ + struct tasklet_struct send_tasklet; /* called by send cq handler */ + struct tasklet_struct recv_tasklet; /* called by recv cq handler */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 327dd8ee3590..8384c4306c7d 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -136,7 +136,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); + struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int i = 0, rc; int polled = 0; @@ -145,9 +145,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(dev->roce_cq_send, 
SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(smcibcq->ib_cq, + ib_req_notify_cq(dev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); } @@ -162,9 +162,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; - tasklet_schedule(&smcibcq->tasklet); + tasklet_schedule(&dev->send_tasklet); } /*---------------------------- request submission ---------------------------*/ @@ -327,7 +327,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibcq_send->ib_cq, + ib_req_notify_cq(link->smcibdev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { @@ -371,7 +371,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibcq_send->ib_cq, + ib_req_notify_cq(link->smcibdev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; @@ -486,7 +486,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); + struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int polled = 0; int rc; @@ -495,9 +495,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(smcibcq->ib_cq, + ib_req_notify_cq(dev->roce_cq_recv, IB_CQ_SOLICITED_MASK | 
IB_CQ_REPORT_MISSED_EVENTS); } @@ -511,9 +511,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; - tasklet_schedule(&smcibcq->tasklet); + tasklet_schedule(&dev->recv_tasklet); } int smc_wr_rx_post_init(struct smc_link *link) @@ -842,14 +842,14 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->ib_cq_recv->tasklet); - tasklet_kill(&smcibdev->ib_cq_send->tasklet); + tasklet_kill(&smcibdev->recv_tasklet); + tasklet_kill(&smcibdev->send_tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 3f780ff02d4b51756237f4d62cba690c9bcb4108 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:36 +0800 Subject: [PATCH 11/95] anolis: Revert "anolis: net/smc: Introduce link-related proc file" ANBZ: #1742 This reverts commit 0580bb35952f1d36a2e1d52252e12f6e3b11abf8. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_proc.c | 58 +++------------------------------------------- net/smc/smc_proc.h | 10 ++++---- 2 files changed, 7 insertions(+), 61 deletions(-) diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c index 106887b7b9e1..19d8cc82a7ac 100644 --- a/net/smc/smc_proc.c +++ b/net/smc/smc_proc.c @@ -154,11 +154,9 @@ static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, smc->conn.tx_cnt, smc->conn.tx_bytes, - smc->conn.tx_corked_cnt, smc->conn.tx_corked_bytes); + lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); } else { - seq_puts(seq, "- - - - - - -" - " - - -\n"); + seq_puts(seq, "- - - - - - - -\n"); } } @@ -172,7 +170,7 @@ static int smc_conn_show(struct seq_file *seq, void *v) seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_P", "tx_B", "cork_P", "cork_B"); + "l_qp", "r_qp", "tx_cnt", "rx_cnt"); goto out; } @@ -236,51 +234,6 @@ static struct smc_proc_entry smc_proc[] = { #endif }; -extern struct smc_lgr_list smc_lgr_list; -static int proc_show_links(struct seq_file *seq, void *v) -{ - struct smc_link_group *lgr, *lg; - struct smc_link *lnk; - int i = 0, j = 0; - - seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s\n", - "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", - "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags"); - - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lnk = &lgr->lnk[i]; - if (!smc_link_usable(lnk)) - continue; - for (j = 0; j < SMC_LGR_ID_SIZE; j++) - seq_printf(seq, "%02X", lgr->id[j]); - seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu\n", - lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, - lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, - lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, - lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, - atomic_read(&lnk->local_rq_credits), - atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, - lnk->peer_cr_watermark_low, lnk->flags); - } - } - spin_unlock_bh(&smc_lgr_list.lock); - return 0; -} - -static int proc_open_links(struct inode *inode, struct file *file) -{ - single_open(file, proc_show_links, NULL); - return 0; -} - -static struct proc_ops link_file_ops = { -.proc_open = proc_open_links, -.proc_read = seq_read, -.proc_release = single_release, -}; - static int __net_init smc_proc_dir_init(struct net *net) { int i, rc = -ENOMEM; @@ -297,9 +250,6 @@ static int __net_init smc_proc_dir_init(struct net *net) goto err_entry; } - if (!proc_create("links", 0444, net->proc_net_smc, &link_file_ops)) - goto err_entry; - return 0; err_entry: @@ -315,8 +265,6 @@ static void __net_exit smc_proc_dir_exit(struct net *net) { int i; - remove_proc_entry("links", net->proc_net_smc); - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) remove_proc_entry(smc_proc[i].name, net->proc_net_smc); diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h index faa5eaaee511..ec59ca03e163 100644 --- a/net/smc/smc_proc.h +++ b/net/smc/smc_proc.h @@ -9,14 +9,12 @@ #include #include "smc.h" -#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ - "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ - "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") +#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") #define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") #define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8llu %-8llu %-8llu %-8llu\n") +#define 
CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") struct smc_proc_private { struct seq_net_private p; -- Gitee From 06f8744836c1f4912c0da9efbd34b8bdc8cf8130 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:36 +0800 Subject: [PATCH 12/95] anolis: Revert "anolis: net/smc: Support rq flow control in smc-r link layer" ANBZ: #1742 This reverts commit 34979d87c7f77d0b8d06e73c763eab9c9b4a0f46. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 12 ------ net/smc/smc_cdc.c | 12 +----- net/smc/smc_cdc.h | 3 +- net/smc/smc_clc.c | 3 -- net/smc/smc_clc.h | 3 +- net/smc/smc_core.h | 17 +-------- net/smc/smc_ib.c | 6 +-- net/smc/smc_llc.c | 92 +--------------------------------------------- net/smc/smc_llc.h | 5 --- net/smc/smc_wr.c | 31 +++------------- net/smc/smc_wr.h | 54 +-------------------------- 11 files changed, 15 insertions(+), 223 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f05a61d77fc2..25c27c21fc69 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -656,13 +656,6 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; - link->credits_enable = clc->r0.init_credits ? 1 : 0; - if (link->credits_enable) { - atomic_set(&link->peer_rq_credits, clc->r0.init_credits); - // set peer rq credits watermark, if less than init_credits * 2/3, - // then credit announcement is needed. 
- link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); - } } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, @@ -1206,11 +1199,6 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { - if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { - reason_code = SMC_CLC_DECL_CREDITSERR; - goto connect_abort; - } - if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGRMB; goto connect_abort; diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 84eed367699e..5c731f27996e 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -111,30 +111,25 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; - struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; - u8 saved_credits = 0; int rc; smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc(cdc_msg, conn, &cfed); - saved_credits = (u8)smc_wr_rx_get_credits(link); - cdc_msg->credits = saved_credits; + smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (likely(!rc)) { + if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_wr_rx_put_credits(link, saved_credits); atomic_dec(&conn->cdc_pend_tx_wr); } @@ -450,9 +445,6 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ - if (cdc->credits) - smc_wr_tx_put_credits(link, cdc->credits, true); - /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h 
index 145ce7997e64..696cc11f2303 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,8 +47,7 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 credits; /* credits synced by every cdc msg */ - u8 reserved[17]; + u8 reserved[18]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index bd07837d21d9..f9f3f59c79de 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1040,12 +1040,9 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; - clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); - clc->r0.init_credits = - link->credits_enable ? (u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_short; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index eb4bba54d6df..83f02f131fc0 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -63,7 +63,6 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ -#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -191,7 +190,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved; __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5849a98c7f6e..35a85ec08919 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -21,12 +21,7 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 32 /* max. 
# of RMBs per link group. Correspondingly, - * SMC_WR_BUF_CNT should not be less than 2 * - * SMC_RMBS_PER_LGR_MAX, since every connection at - * least has two rq/sq credits in average, otherwise - * may result in waiting for credits in sending process. - */ +#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -85,8 +80,6 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 -#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 - struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -131,14 +124,6 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ - atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ - atomic_t local_rq_credits; /* credits for local rq flowctrl */ - u8 credits_enable; /* credits enable flag, set when negotiation */ - u8 local_cr_watermark_high; /* local rq credits watermark */ - u8 peer_cr_watermark_low; /* peer rq credits watermark */ - struct work_struct credits_announce_work; /* work for credits announcement */ - unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ - u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9d55173d474f..8e2b1af1d291 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -670,12 +670,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND. - * RDMA_WRITE consumes send queue entities, - * without recv queue entities. + * there are max. 
2 RDMA_WRITE per 1 WR_SEND */ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT, + .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, }, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 67b8b1595770..1d8dafa1a35e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -75,8 +75,7 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 init_credits; /* QP rq init credits for rq flowctrl */ - u8 reserved[7]; + u8 reserved[8]; }; struct smc_llc_msg_add_link_cont_rt { @@ -171,12 +170,6 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; -struct smc_llc_msg_announce_credits { /* type 0x0A */ - struct smc_llc_hdr hd; - u8 credits; - u8 reserved[39]; -}; - struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ struct smc_llc_hdr hd; u8 num_rkeys; @@ -196,7 +189,6 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; - struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -756,46 +748,6 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -/* send credits announce request or response */ -int smc_llc_announce_credits(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool force) -{ - struct smc_llc_msg_announce_credits *announce_credits; - struct smc_wr_tx_pend_priv *pend; - struct smc_wr_buf *wr_buf; - int rc; - u8 saved_credits = 0; - - if (!link->credits_enable || - (!force && !smc_wr_rx_credits_need_announce(link))) - return 0; - - saved_credits = (u8)smc_wr_rx_get_credits(link); - if (!saved_credits) - /* maybe synced by cdc msg */ - return 0; - - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); - if (rc) { - smc_wr_rx_put_credits(link, saved_credits); - return rc; - } - - announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; - memset(announce_credits, 0, 
sizeof(*announce_credits)); - announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; - announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); - if (reqresp == SMC_LLC_RESP) - announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; - announce_credits->credits = saved_credits; - /* send llc message */ - rc = smc_wr_tx_send(link, pend); - if (rc) - smc_wr_rx_put_credits(link, saved_credits); - - return rc; -} - /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -1058,13 +1010,6 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; - link->credits_enable = add_llc->init_credits ? 1 : 0; - if (link->credits_enable) { - atomic_set(&link->peer_rq_credits, add_llc->init_credits); - // set peer rq credits watermark, if less than init_credits * 2/3, - // then credit announcement is needed. 
- link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); - } } /* as an SMC client, process an add link request */ @@ -1985,10 +1930,6 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; - case SMC_LLC_ANNOUNCE_CREDITS: - if (smc_link_active(link)) - smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); - break; case SMC_LLC_REQ_ADD_LINK: /* handle response here, smc_llc_flow_stop() cannot be called * in tasklet context @@ -2074,10 +2015,6 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; - case SMC_LLC_ANNOUNCE_CREDITS: - if (smc_link_active(link)) - smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); - break; default: smc_llc_protocol_violation(link->lgr, qentry->msg.raw.hdr.common.type); @@ -2171,27 +2108,6 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } -static void smc_llc_announce_credits_work(struct work_struct *work) -{ - struct smc_link *link = container_of(work, - struct smc_link, credits_announce_work); - int rc, retry = 0, agains = 0; - -again: - do { - rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); - } while ((rc == -EBUSY) && smc_link_sendable(link) && - (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); - - if (smc_wr_rx_credits_need_announce(link) && - smc_link_sendable(link) && agains <= 5 && !rc) { - agains++; - goto again; - } - - clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); -} - void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -2227,7 +2143,6 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); - INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; 
} @@ -2259,7 +2174,6 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); - cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -2374,10 +2288,6 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, - { - .handler = smc_llc_rx_handler, - .type = SMC_LLC_ANNOUNCE_CREDITS - }, /* V2 types */ { .handler = smc_llc_rx_handler, diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index f8a14643faf4..4404e52b3346 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -20,8 +20,6 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) -#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY (1) - enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -37,7 +35,6 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, - SMC_LLC_ANNOUNCE_CREDITS = 0X0A, /* V2 types */ SMC_LLC_CONFIRM_LINK_V2 = 0x21, SMC_LLC_ADD_LINK_V2 = 0x22, @@ -89,8 +86,6 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); -int smc_llc_announce_credits(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 8384c4306c7d..ca179e2c86b7 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -130,8 +130,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - if (wq_has_sleeper(&link->wr_tx_wait)) 
- wake_up(&link->wr_tx_wait); + wake_up(&link->wr_tx_wait); } static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) @@ -174,16 +173,11 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; - - if (!smc_wr_tx_get_credit(link)) - return -EBUSY; - for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; - smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -289,7 +283,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - smc_wr_tx_put_credits(link, 1, true); + wake_up(&link->wr_tx_wait); return 1; } else if (link->lgr->smc_version == SMC_V2 && pend->idx == link->wr_tx_cnt) { @@ -475,12 +469,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) break; } } - - if (smc_wr_rx_credits_need_announce(link) && - !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { - set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); - schedule_work(&link->credits_announce_work); - } } } @@ -523,8 +511,6 @@ int smc_wr_rx_post_init(struct smc_link *link) for (i = 0; i < link->wr_rx_cnt; i++) rc = smc_wr_rx_post(link); - // credits have already been announced to peer - atomic_set(&link->local_rq_credits, 0); return rc; } @@ -559,7 +545,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, lnk->qp_attr.cap.max_recv_wr); } @@ -748,7 +734,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = 
kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -756,7 +742,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -775,7 +761,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) @@ -898,11 +884,6 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); - atomic_set(&lnk->peer_rq_credits, 0); - atomic_set(&lnk->local_rq_credits, 0); - lnk->flags = 0; - lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); - lnk->peer_cr_watermark_low = 0; return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 8cf276215c91..a54e90a1110f 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,12 +19,7 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT - * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, - * since every connection at least has two rq/sq - * credits in average, otherwise may result in - * waiting for credits in sending process. 
- */ +#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -88,51 +83,6 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } -// get one tx credit, and peer rq credits dec -static inline int smc_wr_tx_get_credit(struct smc_link *link) -{ - return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; -} - -// put tx credits, when some failures occurred after tx credits got -// or receive announce credits msgs -static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) -{ - if (link->credits_enable && credits) { - atomic_add(credits, &link->peer_rq_credits); - if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) - wake_up_nr(&link->wr_tx_wait, credits); - } -} - -// to check whether peer rq credits is lower than watermark. -static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) -{ - return link->credits_enable && - atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; -} - -// get local rq credits and set credits to zero. -// may called when announcing credits -static inline int smc_wr_rx_get_credits(struct smc_link *link) -{ - return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; -} - -// called when post_recv a rqe -static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) -{ - if (link->credits_enable && credits) - atomic_add(credits, &link->local_rq_credits); -} - -// to check whether local rq credits is higher than watermark. 
-static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) -{ - return link->credits_enable && - atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; -} - /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -145,8 +95,6 @@ static inline int smc_wr_rx_post(struct smc_link *link) index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); - if (!rc) - smc_wr_rx_put_credits(link, 1); return rc; } -- Gitee From 08c979005f774910691a680965a70f1a094e93de Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:37 +0800 Subject: [PATCH 13/95] anolis: Revert "anolis: net/smc: Add sysctl conrtol for handshake limiation" ANBZ: #1742 This reverts commit 7f553a5d8797ddfa4bff40c349b82136467d702d. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 2 +- net/smc/smc_sysctl.c | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 8b7de3d00625..f961104d9f90 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -19,7 +19,7 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - int limit_smc_hs; /* constraint on handshake */ + bool limit_smc_hs; /* constraint on handshake */ struct smc_convert smc_conv; #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 676c2848d82d..ae599b491b11 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -61,15 +61,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { - .procname = "limit_handshake", - .data = &init_net.smc.limit_smc_hs, - .maxlen = sizeof(init_net.smc.limit_smc_hs), - .mode = 0644, - .proc_handler = 
proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; -- Gitee From e7c116132ed5a5511c374bf35689eebf6c878106 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:38 +0800 Subject: [PATCH 14/95] anolis: Revert "anolis: net/smc: Avoid unmapping bufs from unused links" ANBZ: #1742 This reverts commit d333299d5647e210711daf4c8572477f3c16fa1d. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9bf65589ded7..3cd3604a1739 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1292,11 +1292,8 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, { int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_UNUSED) - continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - } if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); -- Gitee From 044f4ce5d1bfd83983c5de3516a47a10cf16e065 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:39 +0800 Subject: [PATCH 15/95] anolis: Revert "anolis: net/smc: allow different subnet communication" ANBZ: #1742 This reverts commit 75ba5a8f016edb3a8bd9440ebbe5b646c8ca307f. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 - net/smc/af_smc.c | 11 ++++------- net/smc/smc_sysctl.c | 10 ---------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index f961104d9f90..60facba8cf22 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -28,6 +28,5 @@ struct netns_smc { int sysctl_wmem_default; int sysctl_rmem_default; int sysctl_tcp2smc; - int sysctl_allow_different_subnet; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 25c27c21fc69..295fcaad1a7a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2131,7 +2131,6 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -2139,12 +2138,10 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - if (!net->smc.sysctl_allow_different_subnet) { - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); - } + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index ae599b491b11..9a44948a3d06 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -52,15 +52,6 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "allow_different_subnet", - .data = &init_net.smc.sysctl_allow_different_subnet, - .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 
= SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; @@ -88,7 +79,6 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_wmem_default = 256 * 1024; net->smc.sysctl_rmem_default = 384 * 1024; net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_allow_different_subnet = 1; return 0; -- Gitee From 866faf817bba91e07dde03dbf299f7693259ead9 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:40 +0800 Subject: [PATCH 16/95] anolis: Revert "anolis: net/smc: don't call ib_req_notify_cq in the send routine" ANBZ: #1742 This reverts commit 9789418d2a0b14908ae096451514f91208c1eaa1. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 6 ------ net/smc/smc_wr.c | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 8e2b1af1d291..65bf38cac7fd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -135,12 +135,6 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; - - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc) - goto out; - rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ca179e2c86b7..24be1d03fef9 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -306,6 +306,8 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From 070a355ba55c8572146c48a578773ea00a4ede7a Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Tue, 26 Jul 2022 22:25:41 +0800 Subject: [PATCH 17/95] anolis: Revert "anolis: net/smc: Add TX and RX diagnosis information" ANBZ: #1742 This reverts commit 9e362b9622d81396140d59f00288968c72736c35. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/uapi/linux/smc_diag.h | 6 ------ net/smc/smc.h | 6 ------ net/smc/smc_core.c | 15 --------------- net/smc/smc_diag.c | 6 ------ net/smc/smc_rx.c | 2 -- net/smc/smc_tx.c | 8 +------- 6 files changed, 1 insertion(+), 42 deletions(-) diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 182efdd3ec91..8cb3a6fef553 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -79,12 +79,6 @@ struct smc_diag_conninfo { struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ struct smc_diag_cursor tx_sent; /* sent cursor */ struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ - __u64 rx_cnt; /* rx counter */ - __u64 tx_cnt; /* tx counter */ - __u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ - __u64 rx_bytes; /* rx size */ - __u64 tx_bytes; /* tx size */ - __u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ }; /* SMC_DIAG_LINKINFO */ diff --git a/net/smc/smc.h b/net/smc/smc.h index 9ee5eeb600e4..0b7f39df2449 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -228,12 +228,6 @@ struct smc_connection { u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ - u64 rx_cnt; /* rx counter */ - u64 tx_cnt; /* tx counter */ - u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ - u64 rx_bytes; /* rx size */ - u64 tx_bytes; /* tx size */ - u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 freed : 1; /* normal termiation */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3cd3604a1739..45234b3877ef 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1844,20 +1844,6 @@ 
static bool smcd_lgr_match(struct smc_link_group *lgr, return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } -static void smc_rx_tx_counter_init(struct smc_connection *conn) -{ - /* Initialize RX & TX diagnostic inform for each - * connection. These counters mean what smc wants - * net devices "TODO" insead of what has been "DONE" - */ - conn->rx_cnt = 0; - conn->tx_cnt = 0; - conn->tx_corked_cnt = 0; - conn->rx_bytes = 0; - conn->tx_bytes = 0; - conn->tx_corked_bytes = 0; -} - /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -1942,7 +1928,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); - smc_rx_tx_counter_init(conn); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index bbe00b50b666..8d436e42a85b 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,12 +136,6 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, - .rx_cnt = conn->rx_cnt, - .tx_cnt = conn->tx_cnt, - .tx_corked_cnt = conn->tx_corked_cnt, - .rx_bytes = conn->rx_bytes, - .tx_bytes = conn->tx_bytes, - .tx_corked_bytes = conn->tx_corked_bytes, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index bf353c68323d..51e8eb2933ff 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -392,7 +392,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable--; /* always stop at urgent Byte */ /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); - conn->rx_bytes += copylen; /* 
determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - @@ -442,7 +441,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } trace_smc_rx_recvmsg(smc, copylen); - ++conn->rx_cnt; } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f305d0033f4a..98ca9229fe87 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -283,14 +283,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) /* If we need to cork, do nothing and wait for the next * sendmsg() call or push on tx completion */ - if (!smc_tx_should_cork(smc, msg)) { - conn->tx_bytes += copylen; - ++conn->tx_cnt; + if (!smc_tx_should_cork(smc, msg)) smc_tx_sndbuf_nonempty(conn); - } else { - conn->tx_corked_bytes += copylen; - ++conn->tx_corked_cnt; - } trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From 6e858b46f080b80b036d9cec5632f510f236ff76 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:41 +0800 Subject: [PATCH 18/95] anolis: Revert "anolis: net/smc: Introduce TCP to SMC replacement netlink commands" ANBZ: #1742 This reverts commit 23c88d4213f78e6d05607b484ea1ca330e026c91. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 9 +- include/uapi/linux/smc.h | 3 - net/smc/Makefile | 2 +- net/smc/af_smc.c | 10 --- net/smc/smc_conv.c | 186 --------------------------------------- net/smc/smc_conv.h | 22 ----- net/smc/smc_netlink.c | 19 +--- net/smc/smc_netlink.h | 5 -- net/socket.c | 39 ++------ 9 files changed, 9 insertions(+), 286 deletions(-) delete mode 100644 net/smc/smc_conv.c delete mode 100644 net/smc/smc_conv.h diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 60facba8cf22..364d0e250734 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -6,21 +6,14 @@ struct smc_stats_rsn; struct smc_stats; -struct smc_convert { - int wlist_len; - struct mutex wlist_lock; - struct list_head wlist; - int (*smc_conv_match_rcu)(struct net *net, char *comm); -}; - struct netns_smc { /* per cpu counters for SMC */ struct smc_stats __percpu *smc_stats; /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; + bool limit_smc_hs; /* constraint on handshake */ - struct smc_convert smc_conv; #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 759bcb2ff03e..3c7278c6ef5d 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -62,9 +62,6 @@ enum { SMC_NETLINK_DUMP_HS_LIMITATION, SMC_NETLINK_ENABLE_HS_LIMITATION, SMC_NETLINK_DISABLE_HS_LIMITATION, - SMC_NETLINK_ADD_TCP2SMC_WLIST, - SMC_NETLINK_DEL_TCP2SMC_WLIST, - SMC_NETLINK_GET_TCP2SMC_WLIST, }; /* SMC_GENL_FAMILY top level attributes */ diff --git a/net/smc/Makefile b/net/smc/Makefile index bd6f807ff803..956810a09da9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o 
smc_stats.o -smc-y += smc_tracepoint.o smc_proc.o smc_conv.o +smc-y += smc_tracepoint.o smc_proc.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 295fcaad1a7a..3b04a765a0dd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -54,7 +54,6 @@ #include "smc_tracepoint.h" #include "smc_sysctl.h" #include "smc_proc.h" -#include "smc_conv.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -3327,17 +3326,9 @@ static int __init smc_init(void) goto out_ulp; } - rc = smc_conv_init(); - if (rc) { - pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); - goto out_proc; - } - static_branch_enable(&tcp_have_smc); return 0; -out_proc: - smc_proc_exit(); out_ulp: tcp_unregister_ulp(&smc_ulp_ops); out_ib: @@ -3370,7 +3361,6 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); - smc_conv_exit(); smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_conv.c b/net/smc/smc_conv.c deleted file mode 100644 index e1f87d1de8a5..000000000000 --- a/net/smc/smc_conv.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include -#include "smc_netlink.h" -#include "smc_conv.h" - -int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *wlist_elem, *tmp; - char msg[TASK_COMM_LEN]; - struct nlattr *na; - - na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; - if (!na) - return -EINVAL; - - nla_strlcpy(msg, na, TASK_COMM_LEN); - - mutex_lock(wlist_lock); - if (*wlist_len >= SMC_MAX_WLIST_LEN) { - mutex_unlock(wlist_lock); - return -EINVAL; - } - - list_for_each_entry(tmp, wlist, list) { - if 
(!strcmp(tmp->task_comm, msg)) - goto out; - } - - wlist_elem = kmalloc(sizeof(*wlist_elem), GFP_KERNEL); - if (!wlist_elem) { - mutex_unlock(wlist_lock); - return -ENOMEM; - } - - strcpy(wlist_elem->task_comm, msg); - list_add_tail_rcu(&wlist_elem->list, wlist); - ++*wlist_len; -out: - mutex_unlock(wlist_lock); - return 0; -} - -int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *tmp, *nxt; - char msg[TASK_COMM_LEN]; - struct nlattr *na; - - na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; - if (!na) - return -EINVAL; - - nla_strlcpy(msg, na, TASK_COMM_LEN); - - mutex_lock(wlist_lock); - list_for_each_entry_safe(tmp, nxt, wlist, list) { - if (!strcmp(tmp->task_comm, msg)) { - list_del_rcu(&tmp->list); - synchronize_rcu(); - kfree(tmp); - --*wlist_len; - break; - } - } - mutex_unlock(wlist_lock); - return 0; -} - -int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - struct list_head *wlist = &net->smc.smc_conv.wlist; - struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); - struct smc_conv_wlist_elem *tmp; - void *nlh; - - if (cb_ctx->pos[0]) - goto errmsg; - - nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &smc_gen_nl_family, NLM_F_MULTI, - SMC_NETLINK_GET_TCP2SMC_WLIST); - if (!nlh) - goto errmsg; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp, wlist, list) { - if (nla_put(skb, SMC_CMD_ATTR_TCP2SMC, - nla_total_size(strlen(tmp->task_comm) + 1), - tmp->task_comm)) { - rcu_read_unlock(); - goto errattr; - } - } - rcu_read_unlock(); - - genlmsg_end(skb, nlh); - cb_ctx->pos[0] = 1; - return skb->len; - -errattr: - genlmsg_cancel(skb, nlh); -errmsg: - return skb->len; -} - -static int smc_match_tcp2smc_wlist(struct net *net, 
char *comm) -{ - struct list_head *wlist = &net->smc.smc_conv.wlist; - struct smc_conv_wlist_elem *tmp; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp, wlist, list) { - if (!strcmp(tmp->task_comm, comm)) { - rcu_read_unlock(); - return 0; - } - } - rcu_read_unlock(); - return -1; -} - -static int __net_init smc_net_conv_init(struct net *net) -{ - INIT_LIST_HEAD_RCU(&net->smc.smc_conv.wlist); - net->smc.smc_conv.wlist_len = 0; - - mutex_init(&net->smc.smc_conv.wlist_lock); - - rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, - smc_match_tcp2smc_wlist); - return 0; -} - -static void __net_exit smc_net_conv_exit(struct net *net) -{ - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *cur, *nxt; - struct list_head tmp_list; - - rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, NULL); - synchronize_rcu(); - - INIT_LIST_HEAD(&tmp_list); - - mutex_lock(wlist_lock); - list_splice_init_rcu(wlist, &tmp_list, synchronize_rcu); - *wlist_len = 0; - mutex_unlock(wlist_lock); - - list_for_each_entry_safe(cur, nxt, &tmp_list, list) { - list_del(&cur->list); - kfree(cur); - } -} - -static struct pernet_operations smc_conv_ops = { - .init = smc_net_conv_init, - .exit = smc_net_conv_exit, -}; - -int __init smc_conv_init(void) -{ - return register_pernet_subsys(&smc_conv_ops); -} - -void smc_conv_exit(void) -{ - unregister_pernet_subsys(&smc_conv_ops); -} diff --git a/net/smc/smc_conv.h b/net/smc/smc_conv.h deleted file mode 100644 index 1615b27feede..000000000000 --- a/net/smc/smc_conv.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef NET_SMC_SMC_CONV_H_ -#define NET_SMC_SMC_CONV_H_ -#include -#include -#include - -#define SMC_MAX_WLIST_LEN 32 - -struct smc_conv_wlist_elem { - char task_comm[TASK_COMM_LEN]; - struct list_head list; -}; - -int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct 
genl_info *info); -int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); -int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb); -int __init smc_conv_init(void); -void smc_conv_exit(void); - -#endif /* NET_SMC_SMC_CONV_H_ */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index 52dba083b70e..c5a62f6f52ba 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -22,7 +22,6 @@ #include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" -#include "smc_conv.h" const struct nla_policy smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { @@ -127,25 +126,9 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_hs_limitation, }, - { - .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .doit = smc_nl_add_tcp2smc_wlist, - }, - { - .cmd = SMC_NETLINK_DEL_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .doit = smc_nl_del_tcp2smc_wlist, - }, - { - .cmd = SMC_NETLINK_GET_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .dumpit = smc_nl_get_tcp2smc_wlist, - }, }; -static const struct nla_policy smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { - [SMC_CMD_ATTR_TCP2SMC] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN - 1 }, +static const struct nla_policy smc_gen_nl_policy[2] = { [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, }; diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index aae13737095e..e8c6c3f0e98c 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -15,11 +15,6 @@ #include #include -enum { - SMC_CMD_ATTR_TCP2SMC = 1, - SMC_CMD_MAX_ATTR, -}; - extern struct genl_family smc_gen_nl_family; extern const struct nla_policy smc_gen_ueid_policy[]; diff --git a/net/socket.c b/net/socket.c index 3917e02b2b2f..96860a0f9330 100644 --- a/net/socket.c +++ b/net/socket.c @@ -141,38 +141,6 @@ static void sock_show_fdinfo(struct seq_file *m, struct file *f) #define sock_show_fdinfo 
NULL #endif -#if IS_ENABLED(CONFIG_SMC) -static bool try_tcp2smc_convert(struct net *net, int *family, int type, - int *protocol, int kern) -{ - int (*f)(struct net *n, char *c) = NULL; - - /* Only convert userspace socket */ - if (kern) - return false; - - if ((*family == AF_INET || *family == AF_INET6) && - type == SOCK_STREAM && - (*protocol == IPPROTO_IP || *protocol == IPPROTO_TCP)) { - if (net->smc.sysctl_tcp2smc) - goto convert; - - rcu_read_lock(); - f = rcu_dereference(net->smc.smc_conv.smc_conv_match_rcu); - if (f && !f(net, current->comm)) { - rcu_read_unlock(); - goto convert; - } - rcu_read_unlock(); - } - return false; -convert: - *protocol = (*family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; - *family = AF_SMC; - return true; -} -#endif - /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. @@ -1400,7 +1368,12 @@ int __sock_create(struct net *net, int family, int type, int protocol, family = PF_PACKET; } #if IS_ENABLED(CONFIG_SMC) - try_tcp2smc_convert(net, &family, type, &protocol, kern); + if (!kern && (family == AF_INET || family == AF_INET6) && + type == SOCK_STREAM && (protocol == IPPROTO_IP || + protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { + protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; + family = AF_SMC; + } #endif err = security_socket_create(family, type, protocol, kern); -- Gitee From c5ae95e5956b3a20366b3cf5cd102e162a487413 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:42 +0800 Subject: [PATCH 19/95] anolis: Revert "anolis: net/smc: Introduce SMC-R-related proc files" ANBZ: #1742 This reverts commit 4239cf264dc397ff4e89029666dfe64cf56f4e11. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/net_namespace.h | 1 - include/net/smc.h | 5 +- net/smc/Makefile | 2 +- net/smc/af_smc.c | 25 +--- net/smc/smc_diag.c | 29 ++-- net/smc/smc_proc.c | 287 ------------------------------------ net/smc/smc_proc.h | 34 ----- 7 files changed, 21 insertions(+), 362 deletions(-) delete mode 100644 net/smc/smc_proc.c delete mode 100644 net/smc/smc_proc.h diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 220878bfe86b..76e9cce289a4 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,7 +95,6 @@ struct net { struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; - struct proc_dir_entry *proc_net_smc; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; diff --git a/include/net/smc.h b/include/net/smc.h index 743b4fe74346..e441aa97ad61 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -12,13 +12,10 @@ #define _SMC_H #define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ -#define SMC_HTABLE_SHIFT 9 -#define SMC_HTABLE_SIZE (1 << SMC_HTABLE_SHIFT) /* Size of SMC hashtable buckets */ struct smc_hashinfo { - unsigned int bkt_idx; rwlock_t lock; - struct hlist_head ht[SMC_HTABLE_SIZE]; + struct hlist_head ht; }; int smc_hash_sk(struct sock *sk); diff --git a/net/smc/Makefile b/net/smc/Makefile index 956810a09da9..875efcd126a2 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_proc.o +smc-y += smc_tracepoint.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3b04a765a0dd..a7d855f2dfd0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,7 +53,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_proc.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -183,13 +182,11 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - write_lock_bh(&h->lock); - - head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + head = &h->ht; + write_lock_bh(&h->lock); sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - write_unlock_bh(&h->lock); return 0; @@ -3232,7 +3229,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc, i; + int rc; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -3302,11 +3299,8 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - - for (i = 0; i < SMC_HTABLE_SIZE; i++) { - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); - } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + 
INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { @@ -3320,17 +3314,9 @@ static int __init smc_init(void) goto out_ib; } - rc = smc_proc_init(); - if (rc) { - pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); - goto out_ulp; - } - static_branch_enable(&tcp_have_smc); return 0; -out_ulp: - tcp_unregister_ulp(&smc_ulp_ops); out_ib: smc_ib_unregister_client(); out_sock: @@ -3361,7 +3347,6 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); - smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8d436e42a85b..25ef26b621a2 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -196,25 +196,24 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; - int rc = 0, num = 0, slot; + int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - - for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { - head = &prot->h.smc_hash->ht[slot]; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + head = &prot->h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; - } + num++; } out: diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c deleted file mode 100644 index 19d8cc82a7ac..000000000000 --- a/net/smc/smc_proc.c +++ /dev/null @@ -1,287 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include "smc.h" -#include "smc_proc.h" -#include "smc_core.h" - -static void *smc_get_next(struct seq_file *seq, 
void *cur) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? - smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - struct net *net = seq_file_net(seq); - struct hlist_head *head; - struct sock *sk = cur; - - if (!sk) { - read_lock(&smc_hash->lock); -get_head: - head = &smc_hash->ht[sp->bucket]; - sk = sk_head(head); - sp->offset = 0; - goto get_sk; - } - ++sp->num; - ++sp->offset; - - sk = sk_next(sk); -get_sk: - sk_for_each_from(sk) { - if (!net_eq(sock_net(sk), net)) - continue; - return sk; - } - sp->offset = 0; - if (++sp->bucket < SMC_HTABLE_SIZE) - goto get_head; - - read_unlock(&smc_hash->lock); - return NULL; -} - -static void *smc_seek_last_pos(struct seq_file *seq) -{ - struct smc_proc_private *sp = seq->private; - int offset = sp->offset; - int orig_num = sp->num; - void *rc = NULL; - - if (sp->bucket >= SMC_HTABLE_SIZE) - goto out; - - rc = smc_get_next(seq, NULL); - while (offset-- && rc) - rc = smc_get_next(seq, rc); - - if (rc) - goto out; - - sp->bucket = 0; -out: - sp->num = orig_num; - return rc; -} - -static void *smc_get_idx(struct seq_file *seq, loff_t pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - sp->bucket = 0; - rc = smc_get_next(seq, NULL); - - while (rc && pos) { - rc = smc_get_next(seq, rc); - --pos; - } - return rc; -} - -static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - if (*pos && *pos == sp->last_pos) { - rc = smc_seek_last_pos(seq); - if (rc) - goto out; - } - - sp->num = 0; - sp->bucket = 0; - sp->offset = 0; - sp->protocol = protocol; - rc = *pos ? 
smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - -out: - sp->last_pos = *pos; - return rc; -} - -static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC); -} - -static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC6); -} - -static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) -{ - struct smc_proc_private *sp = seq->private; - const struct in6_addr *dest, *src; - struct smc_link_group *lgr; - struct socket *clcsock; - struct smc_link *lnk; - struct sock *sk; - bool fb = false; - int i; - - fb = smc->use_fallback; - clcsock = smc->clcsock; - sk = &smc->sk; - - if (protocol == SMCPROTO_SMC) - seq_printf(seq, CONN4_ADDR_FM, sp->num, - clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, - clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); - else if (protocol == SMCPROTO_SMC6) { - dest = &clcsock->sk->sk_v6_daddr; - src = &clcsock->sk->sk_v6_rcv_saddr; - seq_printf(seq, CONN6_ADDR_FM, sp->num, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); - } - - seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, - sk, clcsock->sk, fb ? clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); - - lgr = smc->conn.lgr; - lnk = smc->conn.lnk; - - if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { - for (i = 0; i < SMC_LGR_ID_SIZE; i++) - seq_printf(seq, "%02X", lgr->id[i]); - - seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', - lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); - } else { - seq_puts(seq, "- - - - - - - -\n"); - } -} - -static int smc_conn_show(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct socket *clcsock; - struct smc_sock *smc; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, - "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", - "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_cnt", "rx_cnt"); - goto out; - } - - smc = smc_sk(v); - clcsock = smc->clcsock; - if (!clcsock) - goto out; - - _conn_show(seq, smc, sp->protocol); -out: - return 0; -} - -static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc = NULL; - - if (v == SEQ_START_TOKEN) { - rc = smc_get_idx(seq, 0); - goto out; - } - rc = smc_get_next(seq, v); -out: - ++*pos; - sp->last_pos = *pos; - return rc; -} - -static void smc_conn_stop(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? 
- smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - - if (v && v != SEQ_START_TOKEN) - read_unlock(&smc_hash->lock); -} - -static struct smc_proc_entry smc_proc[] = { - { - .name = "smc4", - .ops = { - .show = smc_conn_show, - .start = smc_conn4_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#if IS_ENABLED(CONFIG_IPV6) - { - .name = "smc6", - .ops = { - .show = smc_conn_show, - .start = smc_conn6_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#endif -}; - -static int __net_init smc_proc_dir_init(struct net *net) -{ - int i, rc = -ENOMEM; - - net->proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); - if (!net->proc_net_smc) - goto err; - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { - if (!proc_create_net_data(smc_proc[i].name, 0444, - net->proc_net_smc, &smc_proc[i].ops, - sizeof(struct smc_proc_private), - NULL)) - goto err_entry; - } - - return 0; - -err_entry: - for (i -= 1; i >= 0; i--) - remove_proc_entry(smc_proc[i].name, net->proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -err: - return rc; -} - -static void __net_exit smc_proc_dir_exit(struct net *net) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) - remove_proc_entry(smc_proc[i].name, net->proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -} - -static struct pernet_operations smc_proc_ops = { - .init = smc_proc_dir_init, - .exit = smc_proc_dir_exit, -}; - -int __init smc_proc_init(void) -{ - return register_pernet_subsys(&smc_proc_ops); -} - -void smc_proc_exit(void) -{ - unregister_pernet_subsys(&smc_proc_ops); -} diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h deleted file mode 100644 index ec59ca03e163..000000000000 --- a/net/smc/smc_proc.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _SMC_PROC_H_ -#define _SMC_PROC_H_ - -#include -#include -#include -#include -#include -#include "smc.h" - -#define CONN4_HDR 
("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") -#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") - -struct smc_proc_private { - struct seq_net_private p; - int num, bucket, offset; - int protocol; - loff_t last_pos; -}; - -struct smc_proc_entry { - const char *name; - const struct seq_operations ops; -}; - -int __init smc_proc_init(void); -void smc_proc_exit(void); - -#endif -- Gitee From 7f9f67df8cc04cb242f1e67aedf788f3fe0acae5 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:43 +0800 Subject: [PATCH 20/95] anolis: Revert "anolis: net/smc: Introduce sysctl tcp2smc" ANBZ: #1742 This reverts commit 0192fd29d56cae7f11b9433337ece0da7a0726c4. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 - net/smc/smc_sysctl.c | 8 -------- net/socket.c | 8 -------- 3 files changed, 17 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 364d0e250734..3ffaddd1ff12 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -20,6 +20,5 @@ struct netns_smc { unsigned int sysctl_autocorking_size; int sysctl_wmem_default; int sysctl_rmem_default; - int sysctl_tcp2smc; }; #endif diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 9a44948a3d06..d6459ed48e5a 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -45,13 +45,6 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, - { - .procname = "tcp2smc", - .data = &init_net.smc.sysctl_tcp2smc, - .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; @@ -78,7 +71,6 @@ int __net_init 
smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_wmem_default = 256 * 1024; net->smc.sysctl_rmem_default = 384 * 1024; - net->smc.sysctl_tcp2smc = 0; return 0; diff --git a/net/socket.c b/net/socket.c index 96860a0f9330..d52c265ad449 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1367,14 +1367,6 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family = PF_PACKET; } -#if IS_ENABLED(CONFIG_SMC) - if (!kern && (family == AF_INET || family == AF_INET6) && - type == SOCK_STREAM && (protocol == IPPROTO_IP || - protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { - protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; - family = AF_SMC; - } -#endif err = security_socket_create(family, type, protocol, kern); if (err) -- Gitee From d803971de61a47c804410d7a42131f21cd966c48 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:44 +0800 Subject: [PATCH 21/95] anolis: Revert "anolis: net/smc: Expose SMCPROTO_SMC and SMCPROTO_SMC6 to userspace" ANBZ: #1742 This reverts commit f49246c3fb66995bffcc245b45d6f7a2bdf2dc85. Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/uapi/linux/in.h | 3 --- include/uapi/linux/in6.h | 2 -- net/smc/smc.h | 4 ++++ 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 40b1e51b18c9..d1b327036ae4 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,9 +84,6 @@ enum { }; #endif -/* SMC protocol, IPv4 */ -#define SMCPROTO_SMC 0 - #if __UAPI_DEF_IN_ADDR /* Internet address. 
*/ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 6c21c85be0e3..5ad396a57eb3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,8 +95,6 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 -/* SMC protocol, IPv6 */ -#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/net/smc/smc.h b/net/smc/smc.h index 0b7f39df2449..ea0620529ebe 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -22,6 +22,10 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 + +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ + #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ -- Gitee From 2f3f7cb93630c75bc6a9cb600debe1f4feeb894b Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 26 Jul 2022 22:25:45 +0800 Subject: [PATCH 22/95] anolis: Revert "anolis: net/smc: Introduce tunable sysctls for sndbuf and RMB size" ANBZ: #1742 This reverts commit d937e579588bf1e6ae5cff9299745eb155c02207. 
Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 2 -- net/smc/af_smc.c | 6 +++--- net/smc/smc_sysctl.c | 22 ---------------------- 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 3ffaddd1ff12..e5389eeaf8bd 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -18,7 +18,5 @@ struct netns_smc { struct ctl_table_header *smc_hdr; #endif unsigned int sysctl_autocorking_size; - int sysctl_wmem_default; - int sysctl_rmem_default; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a7d855f2dfd0..dd418710a72e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -364,8 +363,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - sk->sk_sndbuf = net->smc.sysctl_wmem_default; - sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -3113,6 +3110,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; } + smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); + smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); + out: return rc; } diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index d6459ed48e5a..cf3ab1334c00 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -16,10 +16,6 @@ #include "smc.h" #include "smc_sysctl.h" -#include "smc_core.h" - -static int min_sndbuf = SMC_BUF_MIN_SIZE; -static int min_rcvbuf = SMC_BUF_MIN_SIZE; static struct ctl_table smc_table[] = { { @@ -29,22 +25,6 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - { - .procname = "wmem_default", - .data = 
&init_net.smc.sysctl_wmem_default, - .maxlen = sizeof(init_net.smc.sysctl_wmem_default), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &init_net.smc.sysctl_rmem_default, - .maxlen = sizeof(init_net.smc.sysctl_rmem_default), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, { } }; @@ -69,8 +49,6 @@ int __net_init smc_sysctl_net_init(struct net *net) goto err_reg; net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; - net->smc.sysctl_wmem_default = 256 * 1024; - net->smc.sysctl_rmem_default = 384 * 1024; return 0; -- Gitee From 1e6aa40cca2cebee8b481dda30abfc9eda9838b6 Mon Sep 17 00:00:00 2001 From: liuyacan Date: Thu, 21 Apr 2022 17:40:27 +0800 Subject: [PATCH 23/95] net/smc: sync err code when tcp connection was refused ANBZ: #1742 commit 4e2e65e2e56c6ceb4ea1719360080c0af083229e upstream. In the current implementation, when TCP initiates a connection to an unavailable [ip,port], ECONNREFUSED will be stored in the TCP socket, but SMC will not. However, some apps (like curl) use getsockopt(,,SO_ERROR,,) to get the error information, which makes them miss the error message and behave strangely. Fixes: 50717a37db03 ("net/smc: nonblocking connect rework") Signed-off-by: liuyacan Reviewed-by: Tony Lu Acked-by: Karsten Graul Signed-off-by: David S. 
Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index dd418710a72e..ede542fbc542 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1475,6 +1475,8 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_state = SMC_CLOSED; if (rc == -EPIPE || rc == -EAGAIN) smc->sk.sk_err = EPIPE; + else if (rc == -ECONNREFUSED) + smc->sk.sk_err = ECONNREFUSED; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); sock_put(&smc->sk); /* passive closing */ -- Gitee From bc3a5f90d683402406c9cc510c3f9e8e7d6311b9 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 22 Apr 2022 15:56:18 +0800 Subject: [PATCH 24/95] net/smc: Only save the original clcsock callback functions ANBZ: #1742 commit 4e2e65e2e56c6ceb4ea1719360080c0af083229e upstream. Both the listen and the fallback processes save the current clcsock callback functions and establish new ones. But if both of them happen, the saved callback functions will be overwritten. So this patch introduces some helpers to ensure that only the original callback functions of clcsock are saved.
Fixes: 341adeec9ada ("net/smc: Forward wakeup to smc socket waitqueue after fallback") Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 55 +++++++++++++++++++++++++++++---------------- net/smc/smc.h | 29 ++++++++++++++++++++++++ net/smc/smc_close.c | 3 ++- 3 files changed, 67 insertions(+), 20 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ede542fbc542..e2fc21b59d35 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -373,6 +373,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); + smc_init_saved_callbacks(smc); return sk; } @@ -782,9 +783,24 @@ static void smc_fback_error_report(struct sock *clcsk) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); } +static void smc_fback_replace_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, + &smc->clcsk_state_change); + smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, + &smc->clcsk_data_ready); + smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, + &smc->clcsk_write_space); + smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, + &smc->clcsk_error_report); +} + static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - struct sock *clcsk; int rc = 0; mutex_lock(&smc->clcsock_release_lock); @@ -792,10 +808,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) rc = -EBADF; goto out; } - clcsk = smc->clcsock->sk; - if (smc->use_fallback) - goto out; smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -810,18 +823,7 @@ static int 
smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * in smc sk->sk_wq and they should be woken up * as clcsock's wait queue is woken up. */ - smc->clcsk_state_change = clcsk->sk_state_change; - smc->clcsk_data_ready = clcsk->sk_data_ready; - smc->clcsk_write_space = clcsk->sk_write_space; - smc->clcsk_error_report = clcsk->sk_error_report; - - clcsk->sk_state_change = smc_fback_state_change; - clcsk->sk_data_ready = smc_fback_data_ready; - clcsk->sk_write_space = smc_fback_write_space; - clcsk->sk_error_report = smc_fback_error_report; - - smc->clcsock->sk->sk_user_data = - (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_fback_replace_callbacks(smc); } out: mutex_unlock(&smc->clcsock_release_lock); @@ -1596,6 +1598,19 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; + + /* if new clcsock has also inherited the fallback-specific callback + * functions, switch them back to the original ones. 
+ */ + if (lsmc->use_fallback) { + if (lsmc->clcsk_state_change) + new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; + if (lsmc->clcsk_write_space) + new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; + if (lsmc->clcsk_error_report) + new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; + } + (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -2397,10 +2412,10 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ - smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; - smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, + smc_clcsock_data_ready, &smc->clcsk_data_ready); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; @@ -2415,7 +2430,9 @@ static int smc_listen(struct socket *sock, int backlog) rc = kernel_listen(smc->clcsock, backlog); if (rc) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; goto out; } sk->sk_max_ack_backlog = backlog; diff --git a/net/smc/smc.h b/net/smc/smc.h index ea0620529ebe..5ed765ea0c73 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -288,12 +288,41 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline void smc_init_saved_callbacks(struct smc_sock *smc) +{ + smc->clcsk_state_change = NULL; + smc->clcsk_data_ready = NULL; + smc->clcsk_write_space = NULL; + smc->clcsk_error_report = NULL; +} + static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); } +/* save target_cb in saved_cb, and replace target_cb with new_cb */ +static inline 
void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), + void (*new_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + /* only save once */ + if (!*saved_cb) + *saved_cb = *target_cb; + *target_cb = new_cb; +} + +/* restore target_cb to saved_cb, and reset saved_cb to NULL */ +static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + if (!*saved_cb) + return; + *target_cb = *saved_cb; + *saved_cb = NULL; +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 676cb2333d3c..7bd1ef55b9df 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -214,7 +214,8 @@ int smc_close_active(struct smc_sock *smc) sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } -- Gitee From 7515773fafb46999622fef5c8999715fa7d023b6 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 22 Apr 2022 15:56:19 +0800 Subject: [PATCH 25/95] net/smc: Fix slab-out-of-bounds issue in fallback ANBZ: #1742 commit 4e2e65e2e56c6ceb4ea1719360080c0af083229e upstream. syzbot reported a slab-out-of-bounds/use-after-free issue, which was caused by accessing an already freed smc sock in fallback-specific callback functions of clcsock. This patch fixes the issue by restoring fallback-specific callback functions to original ones and resetting clcsock sk_user_data to NULL before freeing smc sock. Meanwhile, this patch introduces sk_callback_lock to make the access and assignment to sk_user_data mutually exclusive. 
Reported-by: syzbot+b425899ed22c6943e00b@syzkaller.appspotmail.com Fixes: 341adeec9ada ("net/smc: Forward wakeup to smc socket waitqueue after fallback") Link: https://lore.kernel.org/r/00000000000013ca8105d7ae3ada@google.com/ Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 80 ++++++++++++++++++++++++++++++++------------- net/smc/smc_close.c | 2 ++ 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e2fc21b59d35..487fb0d78b1d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -243,11 +243,27 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_fback_restore_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = NULL; + + smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); + smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); + smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); + smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; + smc_fback_restore_callbacks(smc); } } @@ -745,48 +761,57 @@ static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, static void smc_fback_state_change(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + 
smc->clcsk_state_change); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_data_ready(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_data_ready); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_write_space(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_write_space); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_error_report(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_error_report); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_replace_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; + write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, @@ -797,6 +822,8 @@ static void smc_fback_replace_callbacks(struct smc_sock *smc) &smc->clcsk_write_space); smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); } static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) @@ -2370,17 +2397,20 @@ static 
void smc_tcp_listen_work(struct work_struct *work) static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc = - smc_clcsock_user_data(listen_clcsock); + struct smc_sock *lsmc; + read_lock_bh(&listen_clcsock->sk_callback_lock); + lsmc = smc_clcsock_user_data(listen_clcsock); if (!lsmc) - return; + goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } +out: + read_unlock_bh(&listen_clcsock->sk_callback_lock); } static int smc_listen(struct socket *sock, int backlog) @@ -2412,10 +2442,12 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, smc_clcsock_data_ready, &smc->clcsk_data_ready); + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; @@ -2430,9 +2462,11 @@ static int smc_listen(struct socket *sock, int backlog) rc = kernel_listen(smc->clcsock, backlog); if (rc) { + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 7bd1ef55b9df..31db7438857c 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -214,9 +214,11 @@ int smc_close_active(struct smc_sock *smc) sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { + 
write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } smc_close_cleanup_listen(sk); -- Gitee From b4f452fb50890e167ea3d45f0ff3716e727caf76 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 12 May 2022 11:08:20 +0800 Subject: [PATCH 26/95] net/smc: non blocking recvmsg() return -EAGAIN when no data and signal_pending ANBZ: #1742 commit f3c46e41b32b6266cf60b0985c61748f53bf1c61 upstream. Non blocking sendmsg will return -EAGAIN when any signal pending and no send space left, while non blocking recvmsg returns -EINTR when signal pending and no data received. This may be confusing. As TCP returns -EAGAIN in the conditions described above, align the behavior of smc with TCP. Fixes: 846e344eb722 ("net/smc: add receive timeout check") Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Karsten Graul Acked-by: Tony Lu Link: https://lore.kernel.org/r/20220512030820.73848-1-guangguan.wang@linux.alibaba.com Signed-off-by: Jakub Kicinski Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 51e8eb2933ff..338b9ef806e8 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -355,12 +355,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } break; } + if (!timeo) + return -EAGAIN; if (signal_pending(current)) { read_done = sock_intr_errno(timeo); break; } - if (!timeo) - return -EAGAIN; } if (!smc_rx_data_available(conn)) { -- Gitee From 145486b25e4983d998bcca2078ab11436dded056 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 13 May 2022 10:24:53 +0800 Subject: [PATCH 27/95] net/smc: align the connect behaviour with TCP ANBZ: #1742 commit 3aba103006bcc4a7472b7c9506b3bc065ffb7992 
upstream. Connect with O_NONBLOCK will not be completed immediately and returns -EINPROGRESS. It is possible to use selector/poll for completion by selecting the socket for writing. After select indicates writability, a second connect function call will return 0 to indicate connected successfully as TCP does, but smc returns -EISCONN. Use socket state for smc to indicate connect state, which can help smc aligning the connect behaviour with TCP. Signed-off-by: Guangguan Wang Acked-by: Karsten Graul Signed-off-by: David S. Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 50 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 487fb0d78b1d..1736a017cdba 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1544,9 +1544,29 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out_err; lock_sock(sk); + switch (sock->state) { + default: + rc = -EINVAL; + goto out; + case SS_CONNECTED: + rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; + goto out; + case SS_CONNECTING: + if (sk->sk_state == SMC_ACTIVE) + goto connected; + break; + case SS_UNCONNECTED: + sock->state = SS_CONNECTING; + break; + } + switch (sk->sk_state) { default: goto out; + case SMC_CLOSED: + rc = sock_error(sk) ? : -ECONNABORTED; + sock->state = SS_UNCONNECTED; + goto out; case SMC_ACTIVE: rc = -EISCONN; goto out; @@ -1565,20 +1585,24 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out; sock_hold(&smc->sk); /* sock put in passive closing */ - if (smc->use_fallback) + if (smc->use_fallback) { + sock->state = rc ? 
SS_CONNECTING : SS_CONNECTED; goto out; + } if (flags & O_NONBLOCK) { if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; + goto out; } else { rc = __smc_connect(smc); if (rc < 0) goto out; - else - rc = 0; /* success cases including fallback */ } +connected: + rc = 0; + sock->state = SS_CONNECTED; out: release_sock(sk); out_err: @@ -1693,6 +1717,7 @@ struct sock *smc_accept_dequeue(struct sock *parent, } if (new_sock) { sock_graft(new_sk, new_sock); + new_sock->state = SS_CONNECTED; if (isk->use_fallback) { smc_sk(new_sk)->clcsock->file = new_sock->file; isk->clcsock->file->private_data = isk->clcsock; @@ -2424,7 +2449,7 @@ static int smc_listen(struct socket *sock, int backlog) rc = -EINVAL; if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || - smc->connect_nonblock) + smc->connect_nonblock || sock->state != SS_UNCONNECTED) goto out; rc = 0; @@ -2716,6 +2741,17 @@ static int smc_shutdown(struct socket *sock, int how) lock_sock(sk); + if (sock->state == SS_CONNECTING) { + if (sk->sk_state == SMC_ACTIVE) + sock->state = SS_CONNECTED; + else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || + sk->sk_state == SMC_PEERCLOSEWAIT2 || + sk->sk_state == SMC_APPCLOSEWAIT1 || + sk->sk_state == SMC_APPCLOSEWAIT2 || + sk->sk_state == SMC_APPFINCLOSEWAIT) + sock->state = SS_DISCONNECTING; + } + rc = -ENOTCONN; if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && @@ -2729,6 +2765,7 @@ static int smc_shutdown(struct socket *sock, int how) sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sk->sk_socket->state = SS_UNCONNECTED; sock_put(sk); } goto out; @@ -2754,6 +2791,10 @@ static int smc_shutdown(struct socket *sock, int how) /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; + if (sk->sk_state == SMC_CLOSED) + sock->state = SS_UNCONNECTED; + else + sock->state = SS_DISCONNECTING; out: release_sock(sk); 
return rc ? rc : rc1; @@ -3139,6 +3180,7 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, rc = -ENOBUFS; sock->ops = &smc_sock_ops; + sock->state = SS_UNCONNECTED; sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; -- Gitee From f62c929873ac50d2ac40b9fb3380aeaa18c52cb1 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 16 May 2022 13:51:36 +0800 Subject: [PATCH 28/95] net/smc: send cdc msg inline if qp has sufficient inline space ANBZ: #1742 commit b632eb06973209dfac1eba3a9fbd13f0041f3e45 upstream. As cdc msg's length is 44B, cdc msgs can be sent inline in most rdma devices, which can help reducing sending latency. In my test environment, which are 2 VMs running on the same physical host and whose NICs(ConnectX-4Lx) are working on SR-IOV mode, qperf shows 0.4us-0.7us improvement in latency. Test command: server: smc_run taskset -c 1 qperf client: smc_run taskset -c 1 qperf -oo \ msg_size:1:2K:*2 -t 30 -vu tcp_lat The results shown below: msgsize before after 1B 11.9 us 11.2 us (-0.7 us) 2B 11.7 us 11.2 us (-0.5 us) 4B 11.7 us 11.3 us (-0.4 us) 8B 11.6 us 11.2 us (-0.4 us) 16B 11.7 us 11.3 us (-0.4 us) 32B 11.7 us 11.3 us (-0.4 us) 64B 11.7 us 11.2 us (-0.5 us) 128B 11.6 us 11.2 us (-0.4 us) 256B 11.8 us 11.2 us (-0.6 us) 512B 11.8 us 11.4 us (-0.4 us) 1KB 11.9 us 11.4 us (-0.5 us) 2KB 12.1 us 11.5 us (-0.6 us) Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Tested-by: kernel test robot Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 1 + net/smc/smc_wr.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 65bf38cac7fd..d2bd88f3fed5 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -670,6 +670,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, + 
.max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 24be1d03fef9..26f8f240d9e8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; + bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; for (i = 0; i < lnk->wr_tx_cnt; i++) { - lnk->wr_tx_sges[i].addr = + lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) : lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; @@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (send_inline) + lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list = -- Gitee From 6c1014c71289f302ef4513eef1e30fa0a6894e76 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 16 May 2022 13:51:37 +0800 Subject: [PATCH 29/95] net/smc: rdma write inline if qp has sufficient inline space ANBZ: #1742 commit 793a7df63071eb09e5b88addf2a569d7bfd3c973 upstream. Rdma write with inline flag when sending small packages, whose length is shorter than the qp's max_inline_data, can help reducing latency. In my test environment, which are 2 VMs running on the same physical host and whose NICs(ConnectX-4Lx) are working on SR-IOV mode, qperf shows 0.5us-0.7us improvement in latency. 
Test command: server: smc_run taskset -c 1 qperf client: smc_run taskset -c 1 qperf -oo \ msg_size:1:2K:*2 -t 30 -vu tcp_lat The results shown below: msgsize before after 1B 11.2 us 10.6 us (-0.6 us) 2B 11.2 us 10.7 us (-0.5 us) 4B 11.3 us 10.7 us (-0.6 us) 8B 11.2 us 10.6 us (-0.6 us) 16B 11.3 us 10.7 us (-0.6 us) 32B 11.3 us 10.6 us (-0.7 us) 64B 11.2 us 11.2 us (0 us) 128B 11.2 us 11.2 us (0 us) 256B 11.2 us 11.2 us (0 us) 512B 11.4 us 11.3 us (-0.1 us) 1KB 11.4 us 11.5 us (0.1 us) 2KB 11.5 us 11.5 us (0 us) Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Tested-by: kernel test robot Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_tx.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 98ca9229fe87..805a546e8c04 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -391,12 +391,20 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { - struct ib_sge *sge = - wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list; + struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk]; + struct ib_sge *sge = wr->wr.sg_list; + u64 base_addr = dma_addr; + + if (dst_len < link->qp_attr.cap.max_inline_data) { + base_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; + wr->wr.send_flags |= IB_SEND_INLINE; + } else { + wr->wr.send_flags &= ~IB_SEND_INLINE; + } num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sge[srcchunk].addr = dma_addr + src_off; + sge[srcchunk].addr = base_addr + src_off; sge[srcchunk].length = src_len; num_sges++; @@ -410,8 +418,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } - rc = smc_tx_rdma_write(conn, dst_off, num_sges, - &wr_rdma_buf->wr_tx_rdma[dstchunk]); + rc = smc_tx_rdma_write(conn, dst_off, num_sges, 
wr); if (rc) return rc; if (dst_len_sum == len) -- Gitee From 544f5a3bbaedc4750a3eb18ca1d8e116b0ebd78d Mon Sep 17 00:00:00 2001 From: liuyacan Date: Mon, 23 May 2022 12:57:07 +0800 Subject: [PATCH 30/95] net/smc: postpone sk_refcnt increment in connect() ANBZ: #1742 commit 75c1edf23b95a9c66923d9269d8e86e4dbde151f upstream. Same trigger condition as commit 86434744. When setsockopt runs in parallel to a connect() and switches the socket into fallback mode, the sk_refcnt is incremented in smc_connect(), but its state stays in SMC_INIT (NOT SMC_ACTIVE). This causes the corresponding sk_refcnt decrement in __smc_release() to not be performed. Fixes: 86434744fedf ("net/smc: add fallback check to connect()") Signed-off-by: liuyacan Signed-off-by: David S. Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1736a017cdba..bc82e4885261 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1584,11 +1584,11 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, if (rc && rc != -EINPROGRESS) goto out; - sock_hold(&smc->sk); /* sock put in passive closing */ if (smc->use_fallback) { sock->state = rc ? SS_CONNECTING : SS_CONNECTED; goto out; } + sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; -- Gitee From 11bdd999377e044b148d7fd1ff5dd3d8ee3b00df Mon Sep 17 00:00:00 2001 From: liuyacan Date: Mon, 23 May 2022 13:50:56 +0800 Subject: [PATCH 31/95] net/smc: fix listen processing for SMC-Rv2 ANBZ: #1742 commit 8c3b8dc5cc9bf6d273ebe18b16e2d6882bcfb36d upstream. In the process of checking whether RDMAv2 is available, the current implementation first sets ini->smcrv2.ib_dev_v2, and then allocates smc buf desc, but the latter may fail. Unfortunately, the caller will only check the former. 
In this case, a NULL pointer reference will occur in smc_clc_send_confirm_accept() when accessing conn->rmb_desc. This patch does two things: 1. Use the return code to determine whether V2 is available. 2. If the return code is NODEV, continue to check whether V1 is available. Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2") Signed-off-by: liuyacan Signed-off-by: David S. Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bc82e4885261..8f39dc121d9b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2118,13 +2118,13 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) return 0; } -static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, - struct smc_init_info *ini) +static int smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) { struct smc_clc_v2_extension *smc_v2_ext; u8 smcr_version; - int rc; + int rc = 0; if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) goto not_found; @@ -2142,26 +2142,31 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); rc = smc_find_rdma_device(new_smc, ini); - if (rc) { - smc_find_ism_store_rc(rc, ini); + if (rc) goto not_found; - } + if (!ini->smcrv2.uses_gateway) memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); smcr_version = ini->smcr_version; ini->smcr_version = SMC_V2; rc = smc_listen_rdma_init(new_smc, ini); - if (!rc) - rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); - if (!rc) - return; - ini->smcr_version = smcr_version; - smc_find_ism_store_rc(rc, ini); + if (rc) { + ini->smcr_version = 
smcr_version; + goto not_found; + } + rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (rc) { + ini->smcr_version = smcr_version; + goto not_found; + } + return 0; not_found: + rc = rc ?: SMC_CLC_DECL_NOSMCDEV; ini->smcr_version &= ~SMC_V2; ini->check_smcrv2 = false; + return rc; } static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, @@ -2194,6 +2199,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_init_info *ini) { int prfx_rc; + int rc; /* check for ISM device matching V2 proposed device */ smc_find_ism_v2_device_serv(new_smc, pclc, ini); @@ -2221,14 +2227,18 @@ static int smc_listen_find_device(struct smc_sock *new_smc, return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; /* check if RDMA V2 is available */ - smc_find_rdma_v2_device_serv(new_smc, pclc, ini); - if (ini->smcrv2.ib_dev_v2) + rc = smc_find_rdma_v2_device_serv(new_smc, pclc, ini); + if (!rc) return 0; + /* skip V1 check if V2 is unavailable for non-Device reason */ + if (rc != SMC_CLC_DECL_NOSMCDEV && + rc != SMC_CLC_DECL_NOSMCRDEV && + rc != SMC_CLC_DECL_NOSMCDDEV) + return rc; + /* check if RDMA V1 is available */ if (!prfx_rc) { - int rc; - rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); smc_find_ism_store_rc(rc, ini); return (!rc) ? 0 : ini->rc; -- Gitee From 44e41719a2093d72c107fc12b4a827de940dade5 Mon Sep 17 00:00:00 2001 From: liuyacan Date: Tue, 24 May 2022 17:02:30 +0800 Subject: [PATCH 32/95] Revert "net/smc: fix listen processing for SMC-Rv2" ANBZ: #1742 commit 9029ac03f20a5999bc5627277c6cf008ab8e23ed upstream. This reverts commit 8c3b8dc5cc9bf6d273ebe18b16e2d6882bcfb36d. Some rollback issue will be fixed in other patches in the future. 
Link: https://lore.kernel.org/all/20220523055056.2078994-1-liuyacan@corp.netease.com/ Fixes: 8c3b8dc5cc9b ("net/smc: fix listen processing for SMC-Rv2") Signed-off-by: liuyacan Link: https://lore.kernel.org/r/20220524090230.2140302-1-liuyacan@corp.netease.com Signed-off-by: Jakub Kicinski Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8f39dc121d9b..bc82e4885261 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2118,13 +2118,13 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) return 0; } -static int smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, - struct smc_init_info *ini) +static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) { struct smc_clc_v2_extension *smc_v2_ext; u8 smcr_version; - int rc = 0; + int rc; if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) goto not_found; @@ -2142,31 +2142,26 @@ static int smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); rc = smc_find_rdma_device(new_smc, ini); - if (rc) + if (rc) { + smc_find_ism_store_rc(rc, ini); goto not_found; - + } if (!ini->smcrv2.uses_gateway) memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); smcr_version = ini->smcr_version; ini->smcr_version = SMC_V2; rc = smc_listen_rdma_init(new_smc, ini); - if (rc) { - ini->smcr_version = smcr_version; - goto not_found; - } - rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); - if (rc) { - ini->smcr_version = smcr_version; - goto not_found; - } - return 0; + if (!rc) + rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (!rc) + return; + 
ini->smcr_version = smcr_version; + smc_find_ism_store_rc(rc, ini); not_found: - rc = rc ?: SMC_CLC_DECL_NOSMCDEV; ini->smcr_version &= ~SMC_V2; ini->check_smcrv2 = false; - return rc; } static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, @@ -2199,7 +2194,6 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_init_info *ini) { int prfx_rc; - int rc; /* check for ISM device matching V2 proposed device */ smc_find_ism_v2_device_serv(new_smc, pclc, ini); @@ -2227,18 +2221,14 @@ static int smc_listen_find_device(struct smc_sock *new_smc, return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; /* check if RDMA V2 is available */ - rc = smc_find_rdma_v2_device_serv(new_smc, pclc, ini); - if (!rc) + smc_find_rdma_v2_device_serv(new_smc, pclc, ini); + if (ini->smcrv2.ib_dev_v2) return 0; - /* skip V1 check if V2 is unavailable for non-Device reason */ - if (rc != SMC_CLC_DECL_NOSMCDEV && - rc != SMC_CLC_DECL_NOSMCRDEV && - rc != SMC_CLC_DECL_NOSMCDDEV) - return rc; - /* check if RDMA V1 is available */ if (!prfx_rc) { + int rc; + rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); smc_find_ism_store_rc(rc, ini); return (!rc) ? 0 : ini->rc; -- Gitee From 5dae8864507a921003552a41e2f804e07371fda0 Mon Sep 17 00:00:00 2001 From: liuyacan Date: Wed, 25 May 2022 16:54:08 +0800 Subject: [PATCH 33/95] net/smc: set ini->smcrv2.ib_dev_v2 to NULL if SMC-Rv2 is unavailable ANBZ: #1742 commit b3b1a17538d3ef6a9667b2271216fd16d7678ab5 upstream. In the process of checking whether RDMAv2 is available, the current implementation first sets ini->smcrv2.ib_dev_v2, and then allocates smc buf desc and register rmb, but the latter may fail. In this case, the pointer should be reset. 
Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2") Signed-off-by: liuyacan Reviewed-by: Karsten Graul Link: https://lore.kernel.org/r/20220525085408.812273-1-liuyacan@corp.netease.com Signed-off-by: Jakub Kicinski Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bc82e4885261..1e6f3f4534ce 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2161,6 +2161,7 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, not_found: ini->smcr_version &= ~SMC_V2; + ini->smcrv2.ib_dev_v2 = NULL; ini->check_smcrv2 = false; } -- Gitee From b65afe63d493f71bddc7af838b284e4ef3bcdc17 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Sat, 28 May 2022 14:54:57 +0800 Subject: [PATCH 34/95] net/smc: fixes for converting from "struct smc_cdc_tx_pend **" to "struct smc_wr_tx_pend_priv *" ANBZ: #1742 commit e225c9a5a74b12e9ef8516f30a3db2c7eb866ee1 upstream. "struct smc_cdc_tx_pend **" can not directly convert to "struct smc_wr_tx_pend_priv *". Fixes: 2bced6aefa3d ("net/smc: put slot when connection is killed") Signed-off-by: Guangguan Wang Signed-off-by: David S. 
Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_cdc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 5c731f27996e..53f63bfbaf5f 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -82,7 +82,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, /* abnormal termination */ if (!rc) smc_wr_tx_put_slot(link, - (struct smc_wr_tx_pend_priv *)pend); + (struct smc_wr_tx_pend_priv *)(*pend)); rc = -EPIPE; } return rc; -- Gitee From 3ce6414cb898ce1b9bf52e22d6e5a370c0687b80 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 14 Jul 2022 17:44:00 +0800 Subject: [PATCH 35/95] net/smc: remove redundant dma sync ops ANBZ: #1742 commit 6d52e2de6415b7a035b3e8dc4ccffd0da25bbfb9 upstream. smc_ib_sync_sg_for_cpu/device are the ops used for dma memory cache consistency. Smc sndbufs are dma buffers, where CPU writes data to it and PCIE device reads data from it. So for sndbufs, smc_ib_sync_sg_for_device is needed and smc_ib_sync_sg_for_cpu is redundant as PCIE device will not write the buffers. Smc rmbs are dma buffers, where PCIE device writes data to it and CPU reads data from it. So for rmbs, smc_ib_sync_sg_for_cpu is needed and smc_ib_sync_sg_for_device is redundant as CPU will not write the buffers. Signed-off-by: Guangguan Wang Signed-off-by: David S. 
Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 -- net/smc/smc_core.c | 22 ---------------------- net/smc/smc_core.h | 2 -- net/smc/smc_rx.c | 2 -- net/smc/smc_tx.c | 1 - 5 files changed, 29 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1e6f3f4534ce..ff00eccd21d5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1226,7 +1226,6 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } - smc_rmb_sync_sg_for_device(&smc->conn); if (aclc->hdr.version > SMC_V1) { struct smc_clc_msg_accept_confirm_v2 *clc_v2 = @@ -2113,7 +2112,6 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } - smc_rmb_sync_sg_for_device(&new_smc->conn); return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 45234b3877ef..fdc2172a30c5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2306,14 +2306,6 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) return 0; } -void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) -{ - if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || - !smc_link_active(conn->lnk)) - return; - smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); -} - void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || @@ -2336,20 +2328,6 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) } } -void smc_rmb_sync_sg_for_device(struct smc_connection *conn) -{ - int i; - - if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) - return; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_active(&conn->lgr->lnk[i])) - continue; - smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, - DMA_FROM_DEVICE); - } -} - /* create the send and receive buffer for an SMC socket; * receive buffers are called RMBs; * (even though 
the SMC protocol allows more than one RMB-element per RMB, diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 35a85ec08919..775d1219f353 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -514,10 +514,8 @@ void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey); -void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); -void smc_rmb_sync_sg_for_device(struct smc_connection *conn); int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 338b9ef806e8..00ad004835e6 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -413,7 +413,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (rc < 0) { if (!read_done) read_done = -EFAULT; - smc_rmb_sync_sg_for_device(conn); goto out; } } @@ -427,7 +426,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, chunk_len_sum += chunk_len; chunk_off = 0; /* modulo offset in recv ring buffer */ } - smc_rmb_sync_sg_for_device(conn); /* update cursors */ if (!(flags & MSG_PEEK)) { diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 805a546e8c04..ca0d5f57908c 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -246,7 +246,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; - smc_sndbuf_sync_sg_for_cpu(conn); for (chunk = 0; chunk < 2; chunk++) { rc = memcpy_from_msg(sndbuf_base + chunk_off, msg, chunk_len); -- Gitee From ce2bd5ce6b11e2f365b049173cea6673cc9c66c5 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 14 Jul 2022 17:44:01 +0800 Subject: [PATCH 
36/95] net/smc: optimize for smc_sndbuf_sync_sg_for_device and smc_rmb_sync_sg_for_cpu ANBZ: #1742 commit 0ef69e788411cba2af017db731a9fc62d255e9ac upstream. Some CPU, such as Xeon, can guarantee DMA cache coherency. So it is no need to use dma sync APIs to flush cache on such CPUs. In order to avoid calling dma sync APIs on the IO path, use the dma_need_sync to check whether smc_buf_desc needs dma sync when creating smc_buf_desc. Signed-off-by: Guangguan Wang Signed-off-by: David S. Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.c | 8 ++++++++ net/smc/smc_core.h | 1 + net/smc/smc_ib.c | 29 +++++++++++++++++++++++++++++ net/smc/smc_ib.h | 2 ++ 4 files changed, 40 insertions(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fdc2172a30c5..14bb508bfc72 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2032,6 +2032,9 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, goto free_table; } + buf_desc->is_dma_need_sync |= + smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; + /* create a new memory region for the RMB */ if (is_rmb) { rc = smc_ib_get_memory_region(lnk->roce_pd, @@ -2250,6 +2253,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { + buf_desc->is_dma_need_sync = 0; SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); break; /* found reusable slot */ @@ -2308,6 +2312,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { + if (!conn->sndbuf_desc->is_dma_need_sync) + return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) return; @@ -2318,6 +2324,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; + if 
(!conn->rmb_desc->is_dma_need_sync) + return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 775d1219f353..c0a9ac35fd17 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -181,6 +181,7 @@ struct smc_buf_desc { /* mem region registered */ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; /* mem region mapped to lnk */ + u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ }; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d2bd88f3fed5..f604da1d8040 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -734,6 +734,29 @@ int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, return 0; } +bool smc_ib_is_sg_need_sync(struct smc_link *lnk, + struct smc_buf_desc *buf_slot) +{ + struct scatterlist *sg; + unsigned int i; + bool ret = false; + + /* for now there is just one DMA address */ + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { + if (!sg_dma_len(sg)) + break; + if (dma_need_sync(lnk->smcibdev->ibdev->dma_device, + sg_dma_address(sg))) { + ret = true; + goto out; + } + } + +out: + return ret; +} + /* synchronize buffer usage for cpu access */ void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, @@ -742,6 +765,9 @@ void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct scatterlist *sg; unsigned int i; + if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) + return; + /* for now there is just one DMA address */ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, buf_slot->sgt[lnk->link_idx].nents, i) { @@ -762,6 +788,9 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct scatterlist *sg; unsigned int i; + if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) + return; + /* for now there is just one DMA address */ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, buf_slot->sgt[lnk->link_idx].nents, i) { diff --git a/net/smc/smc_ib.h 
b/net/smc/smc_ib.h index 5d8b49c57f50..034295676e88 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -102,6 +102,8 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); +bool smc_ib_is_sg_need_sync(struct smc_link *lnk, + struct smc_buf_desc *buf_slot); void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -- Gitee From 29e3151be6db178716ffb165095ccf40dde1cee4 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 14 Jul 2022 17:44:02 +0800 Subject: [PATCH 37/95] net/smc: Introduce a sysctl for setting SMC-R buffer type ANBZ: #1742 commit 4bc5008e4387106215b50ae1a4ac2467455725ca upstream. This patch introduces the sysctl smcr_buf_type for setting the type of SMC-R sndbufs and RMBs. Valid values include: - SMCR_PHYS_CONT_BUFS, which means use physically contiguous buffers for better performance and is the default value. - SMCR_VIRT_CONT_BUFS, which means use virtually contiguous buffers in case physically contiguous memory is scarce. - SMCR_MIXED_BUFS, which means first try to use physically contiguous buffers. If not available, then use virtually contiguous buffers. Signed-off-by: Wen Gu Signed-off-by: David S. Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- Documentation/networking/smc-sysctl.rst | 13 +++++++++++++ include/net/netns/smc.h | 1 + net/smc/smc_core.h | 6 ++++++ net/smc/smc_sysctl.c | 13 +++++++++++++ 4 files changed, 33 insertions(+) diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index c53f8c61c9e4..a93857e580b0 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -21,3 +21,16 @@ autocorking_size - INTEGER know how/when to uncork their sockets. 
Default: 64K + +smcr_buf_type - INTEGER + Controls which type of sndbufs and RMBs to use in later newly created + SMC-R link group. Only for SMC-R. + + Default: 0 (physically contiguous sndbufs and RMBs) + + Possible values: + + - 0 - Use physically contiguous buffers + - 1 - Use virtually contiguous buffers + - 2 - Mixed use of the two types. Try physically contiguous buffers first. + If not available, use virtually contiguous buffers then. diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index e5389eeaf8bd..2adbe2b245df 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -18,5 +18,6 @@ struct netns_smc { struct ctl_table_header *smc_hdr; #endif unsigned int sysctl_autocorking_size; + unsigned int sysctl_smcr_buf_type; }; #endif diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c0a9ac35fd17..b80aec750987 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -218,6 +218,12 @@ enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ }; +enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ + SMCR_PHYS_CONT_BUFS = 0, + SMCR_VIRT_CONT_BUFS = 1, + SMCR_MIXED_BUFS = 2, +}; + enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index cf3ab1334c00..39b236f868bd 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -15,8 +15,11 @@ #include #include "smc.h" +#include "smc_core.h" #include "smc_sysctl.h" +static int two = 2; + static struct ctl_table smc_table[] = { { .procname = "autocorking_size", @@ -25,6 +28,15 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, + { + .procname = "smcr_buf_type", + .data = &init_net.smc.sysctl_smcr_buf_type, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, { } }; @@ -49,6 +61,7 @@ int __net_init 
smc_sysctl_net_init(struct net *net) goto err_reg; net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; return 0; -- Gitee From 1222eda88f65a47f887b67da5d5a501033b265d5 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 14 Jul 2022 17:44:03 +0800 Subject: [PATCH 38/95] net/smc: Use sysctl-specified types of buffers in new link group ANBZ: #1742 commit b984f370ed5182d180f92dbf14bdf847ff6ccc04 upstream. This patch introduces a new SMC-R specific element buf_type in struct smc_link_group, for recording the value of sysctl smcr_buf_type when the link group is created. Newly created link groups will create and reuse buffers of the type specified by buf_type. Signed-off-by: Wen Gu Signed-off-by: David S. Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.c | 1 + net/smc/smc_core.h | 1 + 2 files changed, 2 insertions(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 14bb508bfc72..5777c2aea658 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -923,6 +923,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; + lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type; atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index b80aec750987..454781d9fce4 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -285,6 +285,7 @@ struct smc_link_group { /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; + enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */ -- Gitee From 7d2448d0823a970221ee476b73b303f8f4f8cd2c Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 14 Jul 2022 17:44:04 +0800 Subject: [PATCH 39/95] net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R ANBZ: #1742 commit 
b8d199451c99b3796b840c350eb74b830c5c869b upstream. On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c qperf -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu Signed-off-by: David S. 
Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 66 ++++++++++++-- net/smc/smc_clc.c | 8 +- net/smc/smc_clc.h | 2 +- net/smc/smc_core.c | 213 +++++++++++++++++++++++++++++++-------------- net/smc/smc_core.h | 10 ++- net/smc/smc_ib.c | 15 ++-- net/smc/smc_llc.c | 33 ++++--- net/smc/smc_rx.c | 90 +++++++++++++++---- net/smc/smc_tx.c | 9 +- 9 files changed, 328 insertions(+), 118 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ff00eccd21d5..ca72a5494499 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -487,6 +487,29 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } +/* register the new vzalloced sndbuf on all links */ +static int smcr_lgr_reg_sndbufs(struct smc_link *link, + struct smc_buf_desc *snd_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + if (!snd_desc->is_vm) + return -EINVAL; + + /* protect against parallel smcr_link_reg_buf() */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); + if (rc) + break; + } + mutex_unlock(&lgr->llc_conf_mutex); + return rc; +} + /* register the new rmb on all links */ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) @@ -498,13 +521,13 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, if (rc) return rc; /* protect against parallel smc_llc_cli_rkey_exchange() and - * parallel smcr_link_reg_rmb() + * parallel smcr_link_reg_buf() */ mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; - rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); + rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; } @@ -550,8 +573,15 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) 
smc_wr_remember_qp_attr(link); - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; @@ -1221,8 +1251,15 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + /* reg sendbufs if they were vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } + } if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_ERR_REGRMB; + reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } @@ -1749,8 +1786,15 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced*/ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); @@ -2109,8 +2153,14 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) struct smc_connection *conn = &new_smc->conn; if (!local_first) { + /* reg sendbufs if they were vzalloced */ + if (conn->sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(conn->lnk, + conn->sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + return 
SMC_CLC_DECL_ERR_REGBUF; } return 0; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index f9f3f59c79de..1472f31480d8 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1034,7 +1034,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, ETH_ALEN); hton24(clc->r0.qpn, link->roce_qp->qp_num); clc->r0.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); switch (clc->hdr.type) { @@ -1046,8 +1046,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, break; } clc->r0.rmbe_size = conn->rmbe_size_short; - clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[link->link_idx].sgl)); + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); if (version == SMC_V1) { clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 83f02f131fc0..5fee545c9a10 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -62,7 +62,7 @@ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ +#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5777c2aea658..114f2337ded8 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1103,34 +1103,37 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, return NULL; } -static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, +static void 
smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { + struct mutex *lock; /* lock buffer list */ int rc; - if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ mutex_lock(&lgr->llc_conf_mutex); - smc_llc_do_delete_rkey(lgr, rmb_desc); - rmb_desc->is_conf_rkey = false; + smc_llc_do_delete_rkey(lgr, buf_desc); + buf_desc->is_conf_rkey = false; mutex_unlock(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } - if (rmb_desc->is_reg_err) { + if (buf_desc->is_reg_err) { /* buf registration failed, reuse not possible */ - mutex_lock(&lgr->rmbs_lock); - list_del(&rmb_desc->list); - mutex_unlock(&lgr->rmbs_lock); + lock = is_rmb ? &lgr->rmbs_lock : + &lgr->sndbufs_lock; + mutex_lock(lock); + list_del(&buf_desc->list); + mutex_unlock(lock); - smc_buf_free(lgr, true, rmb_desc); + smc_buf_free(lgr, is_rmb, buf_desc); } else { - rmb_desc->used = 0; - memset(rmb_desc->cpu_addr, 0, rmb_desc->len); + buf_desc->used = 0; + memset(buf_desc->cpu_addr, 0, buf_desc->len); } } @@ -1138,15 +1141,23 @@ static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) { - conn->sndbuf_desc->used = 0; - memset(conn->sndbuf_desc->cpu_addr, 0, conn->sndbuf_desc->len); + if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { + smcr_buf_unuse(conn->sndbuf_desc, false, lgr); + } else { + conn->sndbuf_desc->used = 0; + memset(conn->sndbuf_desc->cpu_addr, 0, + conn->sndbuf_desc->len); + } } - if (conn->rmb_desc && lgr->is_smcd) { - conn->rmb_desc->used = 0; - memset(conn->rmb_desc->cpu_addr, 0, conn->rmb_desc->len + - sizeof(struct smcd_cdc_msg)); - } else if (conn->rmb_desc) { - smcr_buf_unuse(conn->rmb_desc, lgr); + if (conn->rmb_desc) { + if (!lgr->is_smcd) { + smcr_buf_unuse(conn->rmb_desc, 
true, lgr); + } else { + conn->rmb_desc->used = 0; + memset(conn->rmb_desc->cpu_addr, 0, + conn->rmb_desc->len + + sizeof(struct smcd_cdc_msg)); + } } } @@ -1194,20 +1205,21 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - if (is_rmb) + if (is_rmb || buf_desc->is_vm) buf_desc->is_reg_mr[lnk->link_idx] = false; if (!buf_desc->is_map_ib[lnk->link_idx]) return; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) { - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - buf_desc->mr_rx[lnk->link_idx] = NULL; - } + + if ((is_rmb || buf_desc->is_vm) && + buf_desc->mr[lnk->link_idx]) { + smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); + buf_desc->mr[lnk->link_idx] = NULL; + } + if (is_rmb) smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { + else smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); - } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); buf_desc->is_map_ib[lnk->link_idx] = false; } @@ -1296,8 +1308,10 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - if (buf_desc->pages) + if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); + else if (buf_desc->is_vm && buf_desc->cpu_addr) + vfree(buf_desc->cpu_addr); kfree(buf_desc); } @@ -2009,26 +2023,50 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -/* map an rmb buf to a link */ +/* map an buf to a link */ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - int rc; + int rc, i, nents, offset, buf_size, size, access_flags; + struct scatterlist *sg; + void *buf; if (buf_desc->is_map_ib[lnk->link_idx]) return 0; - rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (buf_desc->is_vm) { + buf = 
buf_desc->cpu_addr; + buf_size = buf_desc->len; + offset = offset_in_page(buf_desc->cpu_addr); + nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; + } else { + nents = 1; + } + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); if (rc) return rc; - sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, - buf_desc->cpu_addr, buf_desc->len); + + if (buf_desc->is_vm) { + /* virtually contiguous buffer */ + for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { + size = min_t(int, PAGE_SIZE - offset, buf_size); + sg_set_page(sg, vmalloc_to_page(buf), size, offset); + buf += size / sizeof(*buf); + buf_size -= size; + offset = 0; + } + } else { + /* physically contiguous buffer */ + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + } /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { + if (rc != nents) { rc = -EAGAIN; goto free_table; } @@ -2036,15 +2074,18 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, buf_desc->is_dma_need_sync |= smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, + if (is_rmb || buf_desc->is_vm) { + /* create a new memory region for the RMB or vzalloced sndbuf */ + access_flags = is_rmb ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_LOCAL_WRITE; + + rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, buf_desc, lnk->link_idx); if (rc) goto buf_unmap; - smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + smc_ib_sync_sg_for_device(lnk, buf_desc, + is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); } buf_desc->is_map_ib[lnk->link_idx] = true; return 0; @@ -2057,20 +2098,23 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, return rc; } -/* register a new rmb on IB device, +/* register a new buf on IB device, rmb or vzalloced sndbuf * must be called under lgr->llc_conf_mutex lock */ -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) { if (list_empty(&link->lgr->list)) return -ENOLINK; - if (!rmb_desc->is_reg_mr[link->link_idx]) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->is_reg_err = true; + if (!buf_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new buf */ + if (buf_desc->is_vm) + buf_desc->mr[link->link_idx]->iova = + (uintptr_t)buf_desc->cpu_addr; + if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { + buf_desc->is_reg_err = true; return -EFAULT; } - rmb_desc->is_reg_mr[link->link_idx] = true; + buf_desc->is_reg_mr[link->link_idx] = true; } return 0; } @@ -2122,18 +2166,38 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) struct smc_buf_desc *buf_desc, *bf; int i, rc = 0; + /* reg all RMBs for a new link */ mutex_lock(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; - rc = smcr_link_reg_rmb(lnk, buf_desc); - if (rc) - goto out; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + mutex_unlock(&lgr->rmbs_lock); + return rc; + } } } -out: mutex_unlock(&lgr->rmbs_lock); + + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + return rc; + + /* reg all vzalloced sndbufs for a new link */ + mutex_lock(&lgr->sndbufs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { + if (!buf_desc->used || !buf_desc->is_vm) + continue; + rc = smcr_link_reg_buf(lnk, 
buf_desc); + if (rc) { + mutex_unlock(&lgr->sndbufs_lock); + return rc; + } + } + } + mutex_unlock(&lgr->sndbufs_lock); return rc; } @@ -2147,18 +2211,39 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, if (!buf_desc) return ERR_PTR(-ENOMEM); - buf_desc->order = get_order(bufsize); - buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_COMP | - __GFP_NORETRY | __GFP_ZERO, - buf_desc->order); - if (!buf_desc->pages) { - kfree(buf_desc); - return ERR_PTR(-EAGAIN); - } - buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); - buf_desc->len = bufsize; + switch (lgr->buf_type) { + case SMCR_PHYS_CONT_BUFS: + case SMCR_MIXED_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (buf_desc->pages) { + buf_desc->cpu_addr = + (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + buf_desc->is_vm = false; + break; + } + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + goto out; + fallthrough; // try virtually continguous buf + case SMCR_VIRT_CONT_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); + if (!buf_desc->cpu_addr) + goto out; + buf_desc->pages = NULL; + buf_desc->len = bufsize; + buf_desc->is_vm = true; + break; + } return buf_desc; + +out: + kfree(buf_desc); + return ERR_PTR(-EAGAIN); } /* map buf_desc on all usable links, @@ -2289,7 +2374,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { - smcr_buf_unuse(buf_desc, lgr); + smcr_buf_unuse(buf_desc, is_rmb, lgr); return -ENOMEM; } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 454781d9fce4..f9b7dd15479d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -169,9 +169,11 @@ struct smc_buf_desc { struct { /* SMC-R */ struct 
sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region + struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; + /* memory region: for rmb and + * vzalloced sndbuf * incl. rkey provided to peer + * and lkey provided to local */ u32 order; /* allocation order */ @@ -184,6 +186,8 @@ struct smc_buf_desc { u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ + u8 is_vm; + /* virtually contiguous */ }; struct { /* SMC-D */ unsigned short sba_idx; @@ -544,7 +548,7 @@ int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index f604da1d8040..1cb600767e88 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -703,7 +703,7 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + sg_num = ib_map_mr_sg(buf_slot->mr[link_idx], buf_slot->sgt[link_idx].sgl, buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); @@ -715,20 +715,21 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[link_idx]) + if (buf_slot->mr[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[link_idx] = + buf_slot->mr[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if 
(IS_ERR(buf_slot->mr_rx[link_idx])) { + if (IS_ERR(buf_slot->mr[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[link_idx]); - buf_slot->mr_rx[link_idx] = NULL; + rc = PTR_ERR(buf_slot->mr[link_idx]); + buf_slot->mr[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != + buf_slot->sgt[link_idx].orig_nents) return -EINVAL; return 0; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 1d8dafa1a35e..65552428e2ab 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -503,19 +503,22 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, if (smc_link_active(link) && link != send_link) { rkeyllc->rtoken[rtok_ix].link_id = link->link_id; rkeyllc->rtoken[rtok_ix].rmb_key = - htonl(rmb_desc->mr_rx[link->link_idx]->rkey); - rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address( - rmb_desc->sgt[link->link_idx].sgl)); + htonl(rmb_desc->mr[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[link->link_idx].sgl)); rtok_ix++; } } /* rkey of send_link is in rtoken[0] */ rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); - rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); + htonl(rmb_desc->mr[send_link->link_idx]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = rmb_desc->is_vm ? 
+ cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); put_out: @@ -542,7 +545,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.llc_type = SMC_LLC_DELETE_RKEY; smc_llc_init_msg_hdr(&rkeyllc->hd, link->lgr, sizeof(*rkeyllc)); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: @@ -612,9 +615,10 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, if (!buf_pos) break; rmb = buf_pos; - ext->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); - ext->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); - ext->rt[i].rmb_vaddr_new = + ext->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + ext->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + ext->rt[i].rmb_vaddr_new = rmb->is_vm ? + cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); while (buf_pos && !(buf_pos)->used) @@ -850,9 +854,10 @@ static int smc_llc_add_link_cont(struct smc_link *link, } rmb = *buf_pos; - addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); - addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); - addc_llc->rt[i].rmb_vaddr_new = + addc_llc->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = rmb->is_vm ? 
+ cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); (*num_rkeys_todo)--; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 00ad004835e6..17c5aee7ee4f 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -145,35 +145,93 @@ static void smc_rx_spd_release(struct splice_pipe_desc *spd, static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, struct smc_sock *smc) { + struct smc_link_group *lgr = smc->conn.lgr; + int offset = offset_in_page(src); + struct partial_page *partial; struct splice_pipe_desc spd; - struct partial_page partial; - struct smc_spd_priv *priv; - int bytes; + struct smc_spd_priv **priv; + struct page **pages; + int bytes, nr_pages; + int i; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? + PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL); + if (!partial) + goto out_page; + priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL); if (!priv) - return -ENOMEM; - priv->len = len; - priv->smc = smc; - partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; - partial.len = len; - partial.private = (unsigned long)priv; - - spd.nr_pages_max = 1; - spd.nr_pages = 1; - spd.pages = &smc->conn.rmb_desc->pages; - spd.partial = &partial; + goto out_part; + for (i = 0; i < nr_pages; i++) { + priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL); + if (!priv[i]) + goto out_priv; + } + + if (lgr->is_smcd || + (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) { + /* smcd or smcr that uses physically contiguous RMBs */ + priv[0]->len = len; + priv[0]->smc = smc; + partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial[0].len = len; + partial[0].private = (unsigned long)priv[0]; + pages[0] = smc->conn.rmb_desc->pages; + } else { + int size, left = len; + void *buf = src; + /* smcr that uses 
virtually contiguous RMBs*/ + for (i = 0; i < nr_pages; i++) { + size = min_t(int, PAGE_SIZE - offset, left); + priv[i]->len = size; + priv[i]->smc = smc; + pages[i] = vmalloc_to_page(buf); + partial[i].offset = offset; + partial[i].len = size; + partial[i].private = (unsigned long)priv[i]; + buf += size / sizeof(*buf); + left -= size; + offset = 0; + } + } + spd.nr_pages_max = nr_pages; + spd.nr_pages = nr_pages; + spd.pages = pages; + spd.partial = partial; spd.ops = &smc_pipe_ops; spd.spd_release = smc_rx_spd_release; bytes = splice_to_pipe(pipe, &spd); if (bytes > 0) { sock_hold(&smc->sk); - get_page(smc->conn.rmb_desc->pages); + if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) { + for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++) + get_page(pages[i]); + } else { + get_page(smc->conn.rmb_desc->pages); + } atomic_add(bytes, &smc->conn.splice_pending); } + kfree(priv); + kfree(partial); + kfree(pages); return bytes; + +out_priv: + for (i = (i - 1); i >= 0; i--) + kfree(priv[i]); + kfree(priv); +out_part: + kfree(partial); +out_page: + kfree(pages); +out: + return -ENOMEM; } static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index ca0d5f57908c..4e8377657a62 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -383,6 +383,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); + u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -395,7 +396,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, u64 base_addr = dma_addr; if (dst_len < link->qp_attr.cap.max_inline_data) { - base_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; + base_addr = virt_addr; wr->wr.send_flags |= IB_SEND_INLINE; } else { wr->wr.send_flags &= ~IB_SEND_INLINE; @@ -403,8 
+404,12 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sge[srcchunk].addr = base_addr + src_off; + sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? + (virt_addr + src_off) : (base_addr + src_off); sge[srcchunk].length = src_len; + if (conn->sndbuf_desc->is_vm) + sge[srcchunk].lkey = + conn->sndbuf_desc->mr[link->link_idx]->lkey; num_sges++; src_off += src_len; -- Gitee From 4afa7dbcbad143716997a3fc035509fd38cf5ae4 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 14 Jul 2022 17:44:05 +0800 Subject: [PATCH 40/95] net/smc: Extend SMC-R link group netlink attribute ANBZ: #1742 commit ddefb2d205539418f3c3851a3e06fac9624f257d upstream. Extend SMC-R link group netlink attribute SMC_GEN_LGR_SMCR. Introduce SMC_NLA_LGR_R_BUF_TYPE to show the buffer type of SMC-R link group. Signed-off-by: Wen Gu Signed-off-by: David S. Miller Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/uapi/linux/smc.h | 3 +++ net/smc/smc_core.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 3c7278c6ef5d..bb4dacca31e7 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -122,6 +122,9 @@ enum { SMC_NLA_LGR_R_CONNS_NUM, /* u32 */ SMC_NLA_LGR_R_V2_COMMON, /* nest */ SMC_NLA_LGR_R_V2, /* nest */ + SMC_NLA_LGR_R_NET_COOKIE, /* u64 */ + SMC_NLA_LGR_R_PAD, /* flag */ + SMC_NLA_LGR_R_BUF_TYPE, /* u8 */ __SMC_NLA_LGR_R_MAX, SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1 }; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 114f2337ded8..0b833b73dd6f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -347,6 +347,8 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type)) + goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto 
errattr; memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); -- Gitee From f3f7fbca9fba543fa0b1f779f971f7202389fe9f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 4 Mar 2022 13:01:48 +0800 Subject: [PATCH 41/95] anolis: net/smc: Introduce tunable sysctls for sndbuf and RMB size ANBZ: #1742 This patch introduces sysctls for SMC, and separates {w|r}mem_default knobs from net.core and net.ipv4 to SMC. SMC connections' sndbuf and RMB are tunable with sysctl net.smc.{w|r}mem_default. Signed-off-by: Tony Lu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 2 ++ net/smc/af_smc.c | 6 +++--- net/smc/smc_sysctl.c | 22 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 2adbe2b245df..b48655e2d4bc 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -19,5 +19,7 @@ struct netns_smc { #endif unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; + int sysctl_wmem_default; + int sysctl_rmem_default; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ca72a5494499..2a41a4c20add 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + sk->sk_sndbuf = net->smc.sysctl_wmem_default; + sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -3254,9 +3257,6 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf,
SMC_BUF_MIN_SIZE); - out: return rc; } diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 39b236f868bd..3e1dd2d6b827 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -17,6 +17,10 @@ #include "smc.h" #include "smc_core.h" #include "smc_sysctl.h" +#include "smc_core.h" + +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; static int two = 2; @@ -37,6 +41,22 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "wmem_default", + .data = &init_net.smc.sysctl_wmem_default, + .maxlen = sizeof(init_net.smc.sysctl_wmem_default), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &init_net.smc.sysctl_rmem_default, + .maxlen = sizeof(init_net.smc.sysctl_rmem_default), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, { } }; @@ -62,6 +82,8 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; + net->smc.sysctl_wmem_default = 256 * 1024; + net->smc.sysctl_rmem_default = 384 * 1024; return 0; -- Gitee From 01e351bfbe0bd18892a9ff40b8ef1c8597d7cc08 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:15:06 +0800 Subject: [PATCH 42/95] anolis: net/smc: Expose SMCPROTO_SMC and SMCPROTO_SMC6 to userspace ANBZ: #1742 This patch exposes SMCPROTO_SMC and SMCPROTO_SMC6 to userspace by moving them to in.h and in6.h. 
Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/uapi/linux/in.h | 3 +++ include/uapi/linux/in6.h | 2 ++ net/smc/smc.h | 4 ---- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index d1b327036ae4..40b1e51b18c9 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,6 +84,9 @@ enum { }; #endif +/* SMC protocol, IPv4 */ +#define SMCPROTO_SMC 0 + #if __UAPI_DEF_IN_ADDR /* Internet address. */ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 5ad396a57eb3..6c21c85be0e3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,6 +95,8 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 +/* SMC protocol, IPv6 */ +#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/net/smc/smc.h b/net/smc/smc.h index 5ed765ea0c73..0f1a51ae6d15 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -22,10 +22,6 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 - -#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ -#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ - #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ -- Gitee From a408cf46d8f39b46b71f0e5bc74f1668614f7ae9 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:16:24 +0800 Subject: [PATCH 43/95] anolis: net/smc: Introduce sysctl tcp2smc ANBZ: #1742 This patch adds sysctl 'tcp2smc' to provide a switch for replacing TCP to SMC-R when new sockets are created in a specific net namespace. 
Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 + net/smc/smc_sysctl.c | 8 ++++++++ net/socket.c | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index b48655e2d4bc..29338e72cc8d 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -21,5 +21,6 @@ struct netns_smc { unsigned int sysctl_smcr_buf_type; int sysctl_wmem_default; int sysctl_rmem_default; + int sysctl_tcp2smc; }; #endif diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 3e1dd2d6b827..e4d09da6de59 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -57,6 +57,13 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, + { + .procname = "tcp2smc", + .data = &init_net.smc.sysctl_tcp2smc, + .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -84,6 +91,7 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_wmem_default = 256 * 1024; net->smc.sysctl_rmem_default = 384 * 1024; + net->smc.sysctl_tcp2smc = 0; return 0; diff --git a/net/socket.c b/net/socket.c index d52c265ad449..96860a0f9330 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1367,6 +1367,14 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family = PF_PACKET; } +#if IS_ENABLED(CONFIG_SMC) + if (!kern && (family == AF_INET || family == AF_INET6) && + type == SOCK_STREAM && (protocol == IPPROTO_IP || + protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { + protocol = (family == AF_INET) ? 
SMCPROTO_SMC : SMCPROTO_SMC6; + family = AF_SMC; + } +#endif err = security_socket_create(family, type, protocol, kern); if (err) -- Gitee From 478a6ac32425826a317f66ce9d77e58fb21adbc5 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:33:42 +0800 Subject: [PATCH 44/95] anolis: net/smc: Introduce SMC-R-related proc files ANBZ: #1742 This patch introduces SMC-R proc files to report statistics information of SMC-R connections. Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/net_namespace.h | 1 + include/net/smc.h | 5 +- net/smc/Makefile | 2 +- net/smc/af_smc.c | 25 +++- net/smc/smc_diag.c | 29 ++-- net/smc/smc_proc.c | 287 ++++++++++++++++++++++++++++++++++++ net/smc/smc_proc.h | 34 +++++ 7 files changed, 362 insertions(+), 21 deletions(-) create mode 100644 net/smc/smc_proc.c create mode 100644 net/smc/smc_proc.h diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 76e9cce289a4..220878bfe86b 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,6 +95,7 @@ struct net { struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; + struct proc_dir_entry *proc_net_smc; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; diff --git a/include/net/smc.h b/include/net/smc.h index e441aa97ad61..743b4fe74346 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -12,10 +12,13 @@ #define _SMC_H #define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ +#define SMC_HTABLE_SHIFT 9 +#define SMC_HTABLE_SIZE (1 << SMC_HTABLE_SHIFT) /* Size of SMC hashtable buckets */ struct smc_hashinfo { + unsigned int bkt_idx; rwlock_t lock; - struct hlist_head ht; + struct hlist_head ht[SMC_HTABLE_SIZE]; }; int smc_hash_sk(struct sock *sk); diff --git a/net/smc/Makefile b/net/smc/Makefile index 875efcd126a2..956810a09da9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o +smc-y += smc_tracepoint.o smc_proc.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2a41a4c20add..3a9f795a9b57 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,6 +53,7 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" +#include "smc_proc.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -182,11 +183,13 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - head = &h->ht; - write_lock_bh(&h->lock); + + head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); return 0; @@ -3373,7 +3376,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc; + int rc, i; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -3443,8 +3446,11 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); + + for (i = 0; i < SMC_HTABLE_SIZE; i++) { + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); + } rc 
= smc_ib_register_client(); if (rc) { @@ -3458,9 +3464,17 @@ static int __init smc_init(void) goto out_ib; } + rc = smc_proc_init(); + if (rc) { + pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); + goto out_ulp; + } + static_branch_enable(&tcp_have_smc); return 0; +out_ulp: + tcp_unregister_ulp(&smc_ulp_ops); out_ib: smc_ib_unregister_client(); out_sock: @@ -3491,6 +3505,7 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); + smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 25ef26b621a2..8d436e42a85b 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -196,24 +196,25 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; - int rc = 0, num = 0; + int rc = 0, num = 0, slot; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - head = &prot->h.smc_hash->ht; - if (hlist_empty(head)) - goto out; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + + for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { + head = &prot->h.smc_hash->ht[slot]; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; + num++; + } } out: diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c new file mode 100644 index 000000000000..19d8cc82a7ac --- /dev/null +++ b/net/smc/smc_proc.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include "smc.h" +#include "smc_proc.h" +#include "smc_core.h" + +static void *smc_get_next(struct seq_file *seq, void *cur) +{ + struct smc_proc_private *sp = 
seq->private; + struct smc_hashinfo *smc_hash = + sp->protocol == SMCPROTO_SMC ? + smc_proto.h.smc_hash : smc_proto6.h.smc_hash; + struct net *net = seq_file_net(seq); + struct hlist_head *head; + struct sock *sk = cur; + + if (!sk) { + read_lock(&smc_hash->lock); +get_head: + head = &smc_hash->ht[sp->bucket]; + sk = sk_head(head); + sp->offset = 0; + goto get_sk; + } + ++sp->num; + ++sp->offset; + + sk = sk_next(sk); +get_sk: + sk_for_each_from(sk) { + if (!net_eq(sock_net(sk), net)) + continue; + return sk; + } + sp->offset = 0; + if (++sp->bucket < SMC_HTABLE_SIZE) + goto get_head; + + read_unlock(&smc_hash->lock); + return NULL; +} + +static void *smc_seek_last_pos(struct seq_file *seq) +{ + struct smc_proc_private *sp = seq->private; + int offset = sp->offset; + int orig_num = sp->num; + void *rc = NULL; + + if (sp->bucket >= SMC_HTABLE_SIZE) + goto out; + + rc = smc_get_next(seq, NULL); + while (offset-- && rc) + rc = smc_get_next(seq, rc); + + if (rc) + goto out; + + sp->bucket = 0; +out: + sp->num = orig_num; + return rc; +} + +static void *smc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct smc_proc_private *sp = seq->private; + void *rc; + + sp->bucket = 0; + rc = smc_get_next(seq, NULL); + + while (rc && pos) { + rc = smc_get_next(seq, rc); + --pos; + } + return rc; +} + +static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) +{ + struct smc_proc_private *sp = seq->private; + void *rc; + + if (*pos && *pos == sp->last_pos) { + rc = smc_seek_last_pos(seq); + if (rc) + goto out; + } + + sp->num = 0; + sp->bucket = 0; + sp->offset = 0; + sp->protocol = protocol; + rc = *pos ? 
smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + +out: + sp->last_pos = *pos; + return rc; +} + +static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) +{ + return _smc_conn_start(seq, pos, SMCPROTO_SMC); +} + +static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) +{ + return _smc_conn_start(seq, pos, SMCPROTO_SMC6); +} + +static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) +{ + struct smc_proc_private *sp = seq->private; + const struct in6_addr *dest, *src; + struct smc_link_group *lgr; + struct socket *clcsock; + struct smc_link *lnk; + struct sock *sk; + bool fb = false; + int i; + + fb = smc->use_fallback; + clcsock = smc->clcsock; + sk = &smc->sk; + + if (protocol == SMCPROTO_SMC) + seq_printf(seq, CONN4_ADDR_FM, sp->num, + clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, + clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); + else if (protocol == SMCPROTO_SMC6) { + dest = &clcsock->sk->sk_v6_daddr; + src = &clcsock->sk->sk_v6_rcv_saddr; + seq_printf(seq, CONN6_ADDR_FM, sp->num, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); + } + + seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, + sk, clcsock->sk, fb ? clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); + + lgr = smc->conn.lgr; + lnk = smc->conn.lnk; + + if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { + for (i = 0; i < SMC_LGR_ID_SIZE; i++) + seq_printf(seq, "%02X", lgr->id[i]); + + seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', + lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, + lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); + } else { + seq_puts(seq, "- - - - - - - -\n"); + } +} + +static int smc_conn_show(struct seq_file *seq, void *v) +{ + struct smc_proc_private *sp = seq->private; + struct socket *clcsock; + struct smc_sock *smc; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, + "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", + "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", + "l_qp", "r_qp", "tx_cnt", "rx_cnt"); + goto out; + } + + smc = smc_sk(v); + clcsock = smc->clcsock; + if (!clcsock) + goto out; + + _conn_show(seq, smc, sp->protocol); +out: + return 0; +} + +static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct smc_proc_private *sp = seq->private; + void *rc = NULL; + + if (v == SEQ_START_TOKEN) { + rc = smc_get_idx(seq, 0); + goto out; + } + rc = smc_get_next(seq, v); +out: + ++*pos; + sp->last_pos = *pos; + return rc; +} + +static void smc_conn_stop(struct seq_file *seq, void *v) +{ + struct smc_proc_private *sp = seq->private; + struct smc_hashinfo *smc_hash = + sp->protocol == SMCPROTO_SMC ? 
+ smc_proto.h.smc_hash : smc_proto6.h.smc_hash; + + if (v && v != SEQ_START_TOKEN) + read_unlock(&smc_hash->lock); +} + +static struct smc_proc_entry smc_proc[] = { + { + .name = "smc4", + .ops = { + .show = smc_conn_show, + .start = smc_conn4_start, + .next = smc_conn_next, + .stop = smc_conn_stop, + }, + }, +#if IS_ENABLED(CONFIG_IPV6) + { + .name = "smc6", + .ops = { + .show = smc_conn_show, + .start = smc_conn6_start, + .next = smc_conn_next, + .stop = smc_conn_stop, + }, + }, +#endif +}; + +static int __net_init smc_proc_dir_init(struct net *net) +{ + int i, rc = -ENOMEM; + + net->proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); + if (!net->proc_net_smc) + goto err; + + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { + if (!proc_create_net_data(smc_proc[i].name, 0444, + net->proc_net_smc, &smc_proc[i].ops, + sizeof(struct smc_proc_private), + NULL)) + goto err_entry; + } + + return 0; + +err_entry: + for (i -= 1; i >= 0; i--) + remove_proc_entry(smc_proc[i].name, net->proc_net_smc); + + remove_proc_entry("smc", net->proc_net); +err: + return rc; +} + +static void __net_exit smc_proc_dir_exit(struct net *net) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) + remove_proc_entry(smc_proc[i].name, net->proc_net_smc); + + remove_proc_entry("smc", net->proc_net); +} + +static struct pernet_operations smc_proc_ops = { + .init = smc_proc_dir_init, + .exit = smc_proc_dir_exit, +}; + +int __init smc_proc_init(void) +{ + return register_pernet_subsys(&smc_proc_ops); +} + +void smc_proc_exit(void) +{ + unregister_pernet_subsys(&smc_proc_ops); +} diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h new file mode 100644 index 000000000000..ec59ca03e163 --- /dev/null +++ b/net/smc/smc_proc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _SMC_PROC_H_ +#define _SMC_PROC_H_ + +#include +#include +#include +#include +#include +#include "smc.h" + +#define CONN4_HDR 
("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") +#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") +#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") + +struct smc_proc_private { + struct seq_net_private p; + int num, bucket, offset; + int protocol; + loff_t last_pos; +}; + +struct smc_proc_entry { + const char *name; + const struct seq_operations ops; +}; + +int __init smc_proc_init(void); +void smc_proc_exit(void); + +#endif -- Gitee From f9d94c38819648058b215ccabe445726992217b6 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:36:07 +0800 Subject: [PATCH 45/95] anolis: net/smc: Introduce TCP to SMC replacement netlink commands ANBZ: #1742 This patch introduces new SMC-R generic netlink commands SMC_NETLINK_{ ADD | DEL | GET }_TCP2SMC_WLIST to add | delete | get application-oriented TCP-to-SMC replacement white list. 
Comparison of the average time cost of creating or destroying 2000 TCP connections in different situations: 1) Without this patch and with the patch that introduces the TCP2SMC sysctl removed: Average creation time cost: 1106 us; Average destruction time cost: 6 us; 2) With this patch but without loading the SMC module: Average creation time cost: 1161 us; Average destruction time cost: 6 us; 3) With this patch and with the SMC module loaded: Average creation time cost: 1157 us; Average destruction time cost: 6 us; 4) With this patch, with the SMC module loaded and 2 elements added to the TCP2SMC conversion white list: Average creation time cost: 1177 us; Average destruction time cost: 6 us; Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 9 +- include/uapi/linux/smc.h | 3 + net/smc/Makefile | 2 +- net/smc/af_smc.c | 10 +++ net/smc/smc_conv.c | 186 +++++++++++++++++++++++++++++++++++++++ net/smc/smc_conv.h | 22 +++++ net/smc/smc_netlink.c | 19 +++- net/smc/smc_netlink.h | 5 ++ net/socket.c | 39 ++++++-- 9 files changed, 286 insertions(+), 9 deletions(-) create mode 100644 net/smc/smc_conv.c create mode 100644 net/smc/smc_conv.h diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 29338e72cc8d..38589bb498c9 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -6,14 +6,21 @@ struct smc_stats_rsn; struct smc_stats; +struct smc_convert { + int wlist_len; + struct mutex wlist_lock; + struct list_head wlist; + int (*smc_conv_match_rcu)(struct net *net, char *comm); +}; + struct netns_smc { /* per cpu counters for SMC */ struct smc_stats __percpu *smc_stats; /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - bool limit_smc_hs; /* constraint on handshake */ + struct smc_convert smc_conv; #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index
bb4dacca31e7..4ec01eb8215e 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -62,6 +62,9 @@ enum { SMC_NETLINK_DUMP_HS_LIMITATION, SMC_NETLINK_ENABLE_HS_LIMITATION, SMC_NETLINK_DISABLE_HS_LIMITATION, + SMC_NETLINK_ADD_TCP2SMC_WLIST, + SMC_NETLINK_DEL_TCP2SMC_WLIST, + SMC_NETLINK_GET_TCP2SMC_WLIST, }; /* SMC_GENL_FAMILY top level attributes */ diff --git a/net/smc/Makefile b/net/smc/Makefile index 956810a09da9..bd6f807ff803 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_proc.o +smc-y += smc_tracepoint.o smc_proc.o smc_conv.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3a9f795a9b57..664ea58aa3a0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -54,6 +54,7 @@ #include "smc_tracepoint.h" #include "smc_sysctl.h" #include "smc_proc.h" +#include "smc_conv.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -3470,9 +3471,17 @@ static int __init smc_init(void) goto out_ulp; } + rc = smc_conv_init(); + if (rc) { + pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); + goto out_proc; + } + static_branch_enable(&tcp_have_smc); return 0; +out_proc: + smc_proc_exit(); out_ulp: tcp_unregister_ulp(&smc_ulp_ops); out_ib: @@ -3505,6 +3514,7 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); + smc_conv_exit(); smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_conv.c b/net/smc/smc_conv.c new file mode 100644 index 000000000000..e1f87d1de8a5 --- /dev/null +++ b/net/smc/smc_conv.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include 
+#include +#include +#include "smc_netlink.h" +#include "smc_conv.h" + +int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *wlist_elem, *tmp; + char msg[TASK_COMM_LEN]; + struct nlattr *na; + + na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; + if (!na) + return -EINVAL; + + nla_strlcpy(msg, na, TASK_COMM_LEN); + + mutex_lock(wlist_lock); + if (*wlist_len >= SMC_MAX_WLIST_LEN) { + mutex_unlock(wlist_lock); + return -EINVAL; + } + + list_for_each_entry(tmp, wlist, list) { + if (!strcmp(tmp->task_comm, msg)) + goto out; + } + + wlist_elem = kmalloc(sizeof(*wlist_elem), GFP_KERNEL); + if (!wlist_elem) { + mutex_unlock(wlist_lock); + return -ENOMEM; + } + + strcpy(wlist_elem->task_comm, msg); + list_add_tail_rcu(&wlist_elem->list, wlist); + ++*wlist_len; +out: + mutex_unlock(wlist_lock); + return 0; +} + +int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *tmp, *nxt; + char msg[TASK_COMM_LEN]; + struct nlattr *na; + + na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; + if (!na) + return -EINVAL; + + nla_strlcpy(msg, na, TASK_COMM_LEN); + + mutex_lock(wlist_lock); + list_for_each_entry_safe(tmp, nxt, wlist, list) { + if (!strcmp(tmp->task_comm, msg)) { + list_del_rcu(&tmp->list); + synchronize_rcu(); + kfree(tmp); + --*wlist_len; + break; + } + } + mutex_unlock(wlist_lock); + return 0; +} + +int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct list_head *wlist = &net->smc.smc_conv.wlist; + struct 
smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_conv_wlist_elem *tmp; + void *nlh; + + if (cb_ctx->pos[0]) + goto errmsg; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_TCP2SMC_WLIST); + if (!nlh) + goto errmsg; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, wlist, list) { + if (nla_put(skb, SMC_CMD_ATTR_TCP2SMC, + nla_total_size(strlen(tmp->task_comm) + 1), + tmp->task_comm)) { + rcu_read_unlock(); + goto errattr; + } + } + rcu_read_unlock(); + + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + return skb->len; + +errattr: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_match_tcp2smc_wlist(struct net *net, char *comm) +{ + struct list_head *wlist = &net->smc.smc_conv.wlist; + struct smc_conv_wlist_elem *tmp; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, wlist, list) { + if (!strcmp(tmp->task_comm, comm)) { + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + return -1; +} + +static int __net_init smc_net_conv_init(struct net *net) +{ + INIT_LIST_HEAD_RCU(&net->smc.smc_conv.wlist); + net->smc.smc_conv.wlist_len = 0; + + mutex_init(&net->smc.smc_conv.wlist_lock); + + rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, + smc_match_tcp2smc_wlist); + return 0; +} + +static void __net_exit smc_net_conv_exit(struct net *net) +{ + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *cur, *nxt; + struct list_head tmp_list; + + rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, NULL); + synchronize_rcu(); + + INIT_LIST_HEAD(&tmp_list); + + mutex_lock(wlist_lock); + list_splice_init_rcu(wlist, &tmp_list, synchronize_rcu); + *wlist_len = 0; + mutex_unlock(wlist_lock); + + list_for_each_entry_safe(cur, nxt, &tmp_list, list) { + list_del(&cur->list); + kfree(cur); + } +} + +static struct 
pernet_operations smc_conv_ops = { + .init = smc_net_conv_init, + .exit = smc_net_conv_exit, +}; + +int __init smc_conv_init(void) +{ + return register_pernet_subsys(&smc_conv_ops); +} + +void smc_conv_exit(void) +{ + unregister_pernet_subsys(&smc_conv_ops); +} diff --git a/net/smc/smc_conv.h b/net/smc/smc_conv.h new file mode 100644 index 000000000000..1615b27feede --- /dev/null +++ b/net/smc/smc_conv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef NET_SMC_SMC_CONV_H_ +#define NET_SMC_SMC_CONV_H_ +#include +#include +#include + +#define SMC_MAX_WLIST_LEN 32 + +struct smc_conv_wlist_elem { + char task_comm[TASK_COMM_LEN]; + struct list_head list; +}; + +int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); +int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); +int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb); +int __init smc_conv_init(void); +void smc_conv_exit(void); + +#endif /* NET_SMC_SMC_CONV_H_ */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index c5a62f6f52ba..52dba083b70e 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -22,6 +22,7 @@ #include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" +#include "smc_conv.h" const struct nla_policy smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { @@ -126,9 +127,25 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_hs_limitation, }, + { + .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .doit = smc_nl_add_tcp2smc_wlist, + }, + { + .cmd = SMC_NETLINK_DEL_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .doit = smc_nl_del_tcp2smc_wlist, + }, + { + .cmd = SMC_NETLINK_GET_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_tcp2smc_wlist, + }, }; -static const struct nla_policy smc_gen_nl_policy[2] = { +static const struct nla_policy 
smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { + [SMC_CMD_ATTR_TCP2SMC] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN - 1 }, [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, }; diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index e8c6c3f0e98c..aae13737095e 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -15,6 +15,11 @@ #include #include +enum { + SMC_CMD_ATTR_TCP2SMC = 1, + SMC_CMD_MAX_ATTR, +}; + extern struct genl_family smc_gen_nl_family; extern const struct nla_policy smc_gen_ueid_policy[]; diff --git a/net/socket.c b/net/socket.c index 96860a0f9330..3917e02b2b2f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -141,6 +141,38 @@ static void sock_show_fdinfo(struct seq_file *m, struct file *f) #define sock_show_fdinfo NULL #endif +#if IS_ENABLED(CONFIG_SMC) +static bool try_tcp2smc_convert(struct net *net, int *family, int type, + int *protocol, int kern) +{ + int (*f)(struct net *n, char *c) = NULL; + + /* Only convert userspace socket */ + if (kern) + return false; + + if ((*family == AF_INET || *family == AF_INET6) && + type == SOCK_STREAM && + (*protocol == IPPROTO_IP || *protocol == IPPROTO_TCP)) { + if (net->smc.sysctl_tcp2smc) + goto convert; + + rcu_read_lock(); + f = rcu_dereference(net->smc.smc_conv.smc_conv_match_rcu); + if (f && !f(net, current->comm)) { + rcu_read_unlock(); + goto convert; + } + rcu_read_unlock(); + } + return false; +convert: + *protocol = (*family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; + *family = AF_SMC; + return true; +} +#endif + /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. 
@@ -1368,12 +1400,7 @@ int __sock_create(struct net *net, int family, int type, int protocol, family = PF_PACKET; } #if IS_ENABLED(CONFIG_SMC) - if (!kern && (family == AF_INET || family == AF_INET6) && - type == SOCK_STREAM && (protocol == IPPROTO_IP || - protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { - protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; - family = AF_SMC; - } + try_tcp2smc_convert(net, &family, type, &protocol, kern); #endif err = security_socket_create(family, type, protocol, kern); -- Gitee From c4e6ee1320206a880567764608257de548d7b4e8 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:41:05 +0800 Subject: [PATCH 46/95] anolis: net/smc: Add TX and RX diagnosis information ANBZ: #1742 This patch adds RX / TX execution and data size counters for each SMC connection which will be reported in diagnosis information. Signed-off-by: Wen Gu Reviewed-by: Tony Lu Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/uapi/linux/smc_diag.h | 6 ++++++ net/smc/smc.h | 6 ++++++ net/smc/smc_core.c | 15 +++++++++++++++ net/smc/smc_diag.c | 6 ++++++ net/smc/smc_rx.c | 2 ++ net/smc/smc_tx.c | 8 +++++++- 6 files changed, 42 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 8cb3a6fef553..182efdd3ec91 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -79,6 +79,12 @@ struct smc_diag_conninfo { struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ struct smc_diag_cursor tx_sent; /* sent cursor */ struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ + __u64 rx_cnt; /* rx counter */ + __u64 tx_cnt; /* tx counter */ + __u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ + __u64 rx_bytes; /* rx size */ + __u64 tx_bytes; /* tx size */ + __u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ }; /* SMC_DIAG_LINKINFO */ diff --git a/net/smc/smc.h b/net/smc/smc.h 
index 0f1a51ae6d15..9d73fc5fdbc2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -228,6 +228,12 @@ struct smc_connection { u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ + u64 rx_cnt; /* rx counter */ + u64 tx_cnt; /* tx counter */ + u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ + u64 rx_bytes; /* rx size */ + u64 tx_bytes; /* tx size */ + u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 freed : 1; /* normal termiation */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0b833b73dd6f..034fd06208e5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1861,6 +1861,20 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } +static void smc_rx_tx_counter_init(struct smc_connection *conn) +{ + /* Initialize RX & TX diagnostic inform for each + * connection. These counters mean what smc wants + * net devices "TODO" insead of what has been "DONE" + */ + conn->rx_cnt = 0; + conn->tx_cnt = 0; + conn->tx_corked_cnt = 0; + conn->rx_bytes = 0; + conn->tx_bytes = 0; + conn->tx_corked_bytes = 0; +} + /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -1945,6 +1959,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); + smc_rx_tx_counter_init(conn); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8d436e42a85b..bbe00b50b666 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,6 +136,12 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .tx_sent.count = conn->tx_curs_sent.count, 
.tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, + .rx_cnt = conn->rx_cnt, + .tx_cnt = conn->tx_cnt, + .tx_corked_cnt = conn->tx_corked_cnt, + .rx_bytes = conn->rx_bytes, + .tx_bytes = conn->tx_bytes, + .tx_corked_bytes = conn->tx_corked_bytes, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 17c5aee7ee4f..4b548e118268 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -450,6 +450,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable--; /* always stop at urgent Byte */ /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); + conn->rx_bytes += copylen; /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - @@ -497,6 +498,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } trace_smc_rx_recvmsg(smc, copylen); + ++conn->rx_cnt; } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 4e8377657a62..55a135345ebf 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -282,8 +282,14 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) /* If we need to cork, do nothing and wait for the next * sendmsg() call or push on tx completion */ - if (!smc_tx_should_cork(smc, msg)) + if (!smc_tx_should_cork(smc, msg)) { + conn->tx_bytes += copylen; + ++conn->tx_cnt; smc_tx_sndbuf_nonempty(conn); + } else { + conn->tx_corked_bytes += copylen; + ++conn->tx_corked_cnt; + } trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From b2af6339efdc269fdf0d542aa4d2b29ed60dd950 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Thu, 2 Sep 2021 13:19:26 +0800 Subject: [PATCH 47/95] anolis: net/smc: don't call ib_req_notify_cq in the send routine ANBZ: #1742 We can just call ib_req_notify_cq() when the link got 
ready, and rearm it after poll_cq(), which is enough to make sure we won't miss any events. A simple sockperf test shows about a 20% gain in throughput with small messages. Test command: client: smc_run sockperf tp -i $SERVER -m 14 -t 30 --tcp server: smc_run sockperf sr --tcp Without this: Summary: BandWidth is 6.504 MBps (52.034 Mbps) With this: Summary: BandWidth is 7.846 MBps (62.771 Mbps) Signed-off-by: Dust Li Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 6 ++++++ net/smc/smc_wr.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 1cb600767e88..ef4fea545d0f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -135,6 +135,12 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; + + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 26f8f240d9e8..261d8b44d275 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -306,8 +306,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From 8ed3f1d8ccc8922307e877e2bfb18dcc8a8cf0c1 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 22 Sep 2021 11:17:18 +0800 Subject: [PATCH 48/95] anolis: net/smc: allow different subnet communication ANBZ: #1742 SMC checks the IP prefix to ensure that peers are in the same subnet. But there is no need to check this for iWARP over ERDMA, because ERDMA peers can communicate with each other beyond the subnet. So we provide a sysctl knob, allow_different_subnet, to support it.
Signed-off-by: Tony Lu Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 11 +++++++---- net/smc/smc_sysctl.c | 10 ++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 38589bb498c9..20581592fc29 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -29,5 +29,6 @@ struct netns_smc { int sysctl_wmem_default; int sysctl_rmem_default; int sysctl_tcp2smc; + int sysctl_allow_different_subnet; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 664ea58aa3a0..df59062f59d7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2249,6 +2249,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { + struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -2256,10 +2257,12 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); + if (!net->smc.sysctl_allow_different_subnet) { + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); + } /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index e4d09da6de59..ae9d36986b72 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -64,6 +64,15 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "allow_different_subnet", + .data = &init_net.smc.sysctl_allow_different_subnet, + .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), + .mode = 0644, + 
.proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -92,6 +101,7 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_wmem_default = 256 * 1024; net->smc.sysctl_rmem_default = 384 * 1024; net->smc.sysctl_tcp2smc = 0; + net->smc.sysctl_allow_different_subnet = 1; return 0; -- Gitee From 753f373cc1d4cf8a6cec88e8623d07620ea86501 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 16 Dec 2021 17:38:05 +0800 Subject: [PATCH 49/95] anolis: net/smc: Avoid unmapping bufs from unused links ANBZ: #1742 ANBZ: #264 smcr_buf_free() intends to unmap each link of the link group from a specific buf_desc according to lnk->link_idx. However, if the link has already been cleared, its lnk->link_idx is 0 and smcr_buf_unmap_link() will repeatedly try to unmap lnk[0] from the buf_desc. The wrong lnk->link_idx doesn't cause any problems currently because unused links have already unmapped their bufs in smcr_link_clear(). But the wrong lnk->link_idx doesn't match the intended semantics, so it is better to avoid unmapping an unused link. Signed-off-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 034fd06208e5..9b3d1936ff20 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1307,8 +1307,11 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, { int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + continue; smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); + } if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); -- Gitee From edb359c0219c5f5ba774b79515075a6c1bcbb7c4 Mon Sep 17 00:00:00 2001 From: "D.
Wythe" Date: Fri, 11 Feb 2022 18:12:25 +0800 Subject: [PATCH 50/95] anolis: net/smc: Add sysctl control for handshake limitation ANBZ: #1742 ANBZ: #264 See commit: net/smc: Add global configure for handshake limitation by netlink. This patch just adds a sysctl controller for anolis. Signed-off-by: D. Wythe Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 2 +- net/smc/smc_sysctl.c | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 20581592fc29..15fd18fe51e3 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -19,7 +19,7 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - bool limit_smc_hs; /* constraint on handshake */ + int limit_smc_hs; /* constraint on handshake */ struct smc_convert smc_conv; #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index ae9d36986b72..69d12e6ad023 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -73,6 +73,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "limit_handshake", + .data = &init_net.smc.limit_smc_hs, + .maxlen = sizeof(init_net.smc.limit_smc_hs), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; -- Gitee From 9c282cacc2f949ea68d29d33482580fc5d644312 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 12 Jan 2022 00:15:51 +0800 Subject: [PATCH 51/95] anolis: net/smc: Support rq flow control in smc-r link layer ANBZ: #1742 ANBZ: #254 This patch supports rq flow control in smc-r link layer.
QPs communicating without rq flow control, as in the previous version, may hit RNR (receiver not ready) errors, which occur when the sq sends a message to the remote qp but the remote qp's rq has no rq entries to receive the message. In an RNR situation, the rdma transport layer may retransmit the message again and again until the rq has free entries, which may lower performance, especially under heavy traffic. Using credits to do rq flow control can avoid the occurrence of RNR. The redis-benchmark test shows more than 3X rps improvement in SET and more than 7X rps improvement in GET. Test command: redis-server --save "" --appendonly no --protected-mode no --io-threads 7 --io-threads-do-reads yes redis-benchmark -h 192.168.26.36 -q -t set,get -P 1 --threads 7 -n 2000000 -c 500 -d 10 Before: SET: 173325.25 requests per second, p50=2.703 msec GET: 81383.52 requests per second, p50=5.575 msec After: SET: 554323.69 requests per second, p50=0.959 msec GET: 604741.19 requests per second, p50=0.855 msec Signed-off-by: Guangguan Wang Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 12 ++++++ net/smc/smc_cdc.c | 12 +++++- net/smc/smc_cdc.h | 3 +- net/smc/smc_clc.c | 3 ++ net/smc/smc_clc.h | 3 +- net/smc/smc_core.h | 17 ++++++++- net/smc/smc_ib.c | 6 ++- net/smc/smc_llc.c | 92 +++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 5 +++ net/smc/smc_wr.c | 31 +++++++++++++--- net/smc/smc_wr.h | 54 ++++++++++++++++++++++++++- 11 files changed, 223 insertions(+), 15 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index df59062f59d7..be59e86fda96 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -703,6 +703,13 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; + link->credits_enable = clc->r0.init_credits ?
1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, clc->r0.init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. + link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); + } } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, @@ -1258,6 +1265,11 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { + reason_code = SMC_CLC_DECL_CREDITSERR; + goto connect_abort; + } + /* reg sendbufs if they were vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 53f63bfbaf5f..410134dccbf9 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -111,25 +111,30 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; + struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; + u8 saved_credits = 0; int rc; smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + smc_host_msg_to_cdc(cdc_msg, conn, &cfed); + saved_credits = (u8)smc_wr_rx_get_credits(link); + cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (!rc) { + if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_wr_rx_put_credits(link, saved_credits); atomic_dec(&conn->cdc_pend_tx_wr); } @@ -445,6 +450,9 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (cdc->len != SMC_WR_TX_SIZE) 
return; /* invalid message */ + if (cdc->credits) + smc_wr_tx_put_credits(link, cdc->credits, true); + /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 696cc11f2303..145ce7997e64 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,7 +47,8 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 reserved[18]; + u8 credits; /* credits synced by every cdc msg */ + u8 reserved[17]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 1472f31480d8..ba20049ef6ce 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1040,9 +1040,12 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; + clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + clc->r0.init_credits = + link->credits_enable ? 
(u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_short; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fee545c9a10..7b068f7e0519 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -63,6 +63,7 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ +#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -190,7 +191,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 reserved; + u8 init_credits; /* QP rq init credits for rq flowctrl */ __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f9b7dd15479d..7f53309ad796 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -21,7 +21,12 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_RMBS_PER_LGR_MAX 32 /* max. # of RMBs per link group. Correspondingly, + * SMC_WR_BUF_CNT should not be less than 2 * + * SMC_RMBS_PER_LGR_MAX, since every connection at + * least has two rq/sq credits in average, otherwise + * may result in waiting for credits in sending process. 
+ */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -80,6 +85,8 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 +#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 + struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -124,6 +131,14 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ + atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ + atomic_t local_rq_credits; /* credits for local rq flowctrl */ + u8 credits_enable; /* credits enable flag, set when negotiation */ + u8 local_cr_watermark_high; /* local rq credits watermark */ + u8 peer_cr_watermark_low; /* peer rq credits watermark */ + struct work_struct credits_announce_work; /* work for credits announcement */ + unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index ef4fea545d0f..5a183b754851 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -670,10 +670,12 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND + * there are max. 2 RDMA_WRITE per 1 WR_SEND. + * RDMA_WRITE consumes send queue entities, + * without recv queue entities. 
*/ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_recv_wr = SMC_WR_BUF_CNT, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, .max_inline_data = 0, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 65552428e2ab..4052651152c1 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -75,7 +75,8 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 reserved[8]; + u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved[7]; }; struct smc_llc_msg_add_link_cont_rt { @@ -170,6 +171,12 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct smc_llc_msg_announce_credits { /* type 0x0A */ + struct smc_llc_hdr hd; + u8 credits; + u8 reserved[39]; +}; + struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ struct smc_llc_hdr hd; u8 num_rkeys; @@ -189,6 +196,7 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; + struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -752,6 +760,46 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } +/* send credits announce request or response */ +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force) +{ + struct smc_llc_msg_announce_credits *announce_credits; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + u8 saved_credits = 0; + + if (!link->credits_enable || + (!force && !smc_wr_rx_credits_need_announce(link))) + return 0; + + saved_credits = (u8)smc_wr_rx_get_credits(link); + if (!saved_credits) + /* maybe synced by cdc msg */ + return 0; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) { + smc_wr_rx_put_credits(link, saved_credits); + return rc; + } + + announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; + memset(announce_credits, 0, 
sizeof(*announce_credits)); + announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; + announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); + if (reqresp == SMC_LLC_RESP) + announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; + announce_credits->credits = saved_credits; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + if (rc) + smc_wr_rx_put_credits(link, saved_credits); + + return rc; +} + /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -1015,6 +1063,13 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; + link->credits_enable = add_llc->init_credits ? 1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, add_llc->init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. 
+ link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); + } } /* as an SMC client, process an add link request */ @@ -1935,6 +1990,10 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); + break; case SMC_LLC_REQ_ADD_LINK: /* handle response here, smc_llc_flow_stop() cannot be called * in tasklet context @@ -2020,6 +2079,10 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); + break; default: smc_llc_protocol_violation(link->lgr, qentry->msg.raw.hdr.common.type); @@ -2113,6 +2176,27 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +static void smc_llc_announce_credits_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, + struct smc_link, credits_announce_work); + int rc, retry = 0, agains = 0; + +again: + do { + rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); + } while ((rc == -EBUSY) && smc_link_sendable(link) && + (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); + + if (smc_wr_rx_credits_need_announce(link) && + smc_link_sendable(link) && agains <= 5 && !rc) { + agains++; + goto again; + } + + clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); +} + void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -2148,6 +2232,7 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; 
} @@ -2179,6 +2264,7 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); + cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -2293,6 +2379,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ANNOUNCE_CREDITS + }, /* V2 types */ { .handler = smc_llc_rx_handler, diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 4404e52b3346..f8a14643faf4 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -20,6 +20,8 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY (1) + enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -35,6 +37,7 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + SMC_LLC_ANNOUNCE_CREDITS = 0X0A, /* V2 types */ SMC_LLC_CONFIRM_LINK_V2 = 0x21, SMC_LLC_ADD_LINK_V2 = 0x22, @@ -86,6 +89,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 261d8b44d275..55c1deb6bc7f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -130,7 +130,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - wake_up(&link->wr_tx_wait); + if 
(wq_has_sleeper(&link->wr_tx_wait)) + wake_up(&link->wr_tx_wait); } static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) @@ -173,11 +174,16 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; + + if (!smc_wr_tx_get_credit(link)) + return -EBUSY; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; + smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -283,7 +289,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - wake_up(&link->wr_tx_wait); + smc_wr_tx_put_credits(link, 1, true); return 1; } else if (link->lgr->smc_version == SMC_V2 && pend->idx == link->wr_tx_cnt) { @@ -469,6 +475,12 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) break; } } + + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); + } } } @@ -511,6 +523,8 @@ int smc_wr_rx_post_init(struct smc_link *link) for (i = 0; i < link->wr_rx_cnt; i++) rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); return rc; } @@ -545,7 +559,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_recv_wr); } @@ -737,7 +751,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs 
= kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -745,7 +759,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -764,7 +778,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) @@ -887,6 +901,11 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + atomic_set(&lnk->peer_rq_credits, 0); + atomic_set(&lnk->local_rq_credits, 0); + lnk->flags = 0; + lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); + lnk->peer_cr_watermark_low = 0; return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index a54e90a1110f..8cf276215c91 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,7 +19,12 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ +#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT + * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, + * since every connection at least has two rq/sq + * credits in average, otherwise may result in + * waiting for credits in sending process. 
+ */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -83,6 +88,51 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } +// get one tx credit, and peer rq credits dec +static inline int smc_wr_tx_get_credit(struct smc_link *link) +{ + return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; +} + +// put tx credits, when some failures occurred after tx credits got +// or receive announce credits msgs +static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) +{ + if (link->credits_enable && credits) { + atomic_add(credits, &link->peer_rq_credits); + if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) + wake_up_nr(&link->wr_tx_wait, credits); + } +} + +// to check whether peer rq credits is lower than watermark. +static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; +} + +// get local rq credits and set credits to zero. +// may called when announcing credits +static inline int smc_wr_rx_get_credits(struct smc_link *link) +{ + return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; +} + +// called when post_recv a rqe +static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) +{ + if (link->credits_enable && credits) + atomic_add(credits, &link->local_rq_credits); +} + +// to check whether local rq credits is higher than watermark. 
+static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -95,6 +145,8 @@ static inline int smc_wr_rx_post(struct smc_link *link) index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); + if (!rc) + smc_wr_rx_put_credits(link, 1); return rc; } -- Gitee From cf862259ce50a812ed41d3cef76daa58c8eb34d6 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 12 Jan 2022 01:04:22 +0800 Subject: [PATCH 52/95] anolis: net/smc: Introduce link-related proc file ANBZ: #1742 ANBZ: #346 This patch introduces link-related proc files to report statistics information of SMC-R links. Signed-off-by: Guangguan Wang Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_proc.c | 58 +++++++++++++++++++++++++++++++++++++++++++--- net/smc/smc_proc.h | 10 ++++---- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c index 19d8cc82a7ac..106887b7b9e1 100644 --- a/net/smc/smc_proc.c +++ b/net/smc/smc_proc.c @@ -154,9 +154,11 @@ static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 'C' : 'S', lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); + lnk->peer_qpn, smc->conn.tx_cnt, smc->conn.tx_bytes, + smc->conn.tx_corked_cnt, smc->conn.tx_corked_bytes); } else { - seq_puts(seq, "- - - - - - - -\n"); + seq_puts(seq, "- - - - - - -" + " - - -\n"); } } @@ -170,7 +172,7 @@ static int smc_conn_show(struct seq_file *seq, void *v) seq_printf(seq, sp->protocol == SMCPROTO_SMC ? 
CONN4_HDR : CONN6_HDR, "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_cnt", "rx_cnt"); + "l_qp", "r_qp", "tx_P", "tx_B", "cork_P", "cork_B"); goto out; } @@ -234,6 +236,51 @@ static struct smc_proc_entry smc_proc[] = { #endif }; +extern struct smc_lgr_list smc_lgr_list; +static int proc_show_links(struct seq_file *seq, void *v) +{ + struct smc_link_group *lgr, *lg; + struct smc_link *lnk; + int i = 0, j = 0; + + seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s\n", + "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", + "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags"); + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + if (!smc_link_usable(lnk)) + continue; + for (j = 0; j < SMC_LGR_ID_SIZE; j++) + seq_printf(seq, "%02X", lgr->id[j]); + seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu\n", + lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, + lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, + lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, + lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, + atomic_read(&lnk->local_rq_credits), + atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, + lnk->peer_cr_watermark_low, lnk->flags); + } + } + spin_unlock_bh(&smc_lgr_list.lock); + return 0; +} + +static int proc_open_links(struct inode *inode, struct file *file) +{ + single_open(file, proc_show_links, NULL); + return 0; +} + +static struct proc_ops link_file_ops = { +.proc_open = proc_open_links, +.proc_read = seq_read, +.proc_release = single_release, +}; + static int __net_init smc_proc_dir_init(struct net *net) { int i, rc = -ENOMEM; @@ -250,6 +297,9 @@ static int __net_init smc_proc_dir_init(struct net *net) goto err_entry; } + if (!proc_create("links", 0444, net->proc_net_smc, &link_file_ops)) + goto err_entry; + return 0; err_entry: @@ -265,6 +315,8 @@ static void __net_exit smc_proc_dir_exit(struct net *net) { int i; + remove_proc_entry("links", net->proc_net_smc); + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) remove_proc_entry(smc_proc[i].name, net->proc_net_smc); diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h index ec59ca03e163..faa5eaaee511 100644 --- a/net/smc/smc_proc.h +++ b/net/smc/smc_proc.h @@ -9,12 +9,14 @@ #include #include "smc.h" -#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ + "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ + "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") #define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") #define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") +#define CONN_SK_FM (" %c %-8X %pK 
%pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8llu %-8llu %-8llu %-8llu\n") struct smc_proc_private { struct seq_net_private p; -- Gitee From 294ef3fd9693898eefe647f3af3d199f4e810b17 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 13 Jan 2022 17:06:19 +0800 Subject: [PATCH 53/95] anolis: net/smc: Introduce smc_ib_cq to bind link and cq ANBZ: #1742 ANBZ: #264 This patch introduces struct smc_ib_cq as a medium between smc_link and ib_cq. Every smc_link can access ib_cq from their own, and unbinds smc_link from smc_ib_device. This allows flexible mapping, prepares for multiple CQs support. Signed-off-by: Tony Lu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.h | 2 ++ net/smc/smc_ib.c | 86 ++++++++++++++++++++++++++++++++-------------- net/smc/smc_ib.h | 13 ++++--- net/smc/smc_wr.c | 32 ++++++++--------- 4 files changed, 88 insertions(+), 45 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 7f53309ad796..9130a6f87264 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -94,6 +94,8 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ + struct smc_ib_cq *smcibcq_recv; /* cq for recv */ + struct smc_ib_cq *smcibcq_send; /* cq for send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 5a183b754851..251cc60b7c8c 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,12 +131,12 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, IB_CQ_SOLICITED_MASK); if (rc) goto out; - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) goto out; @@ -656,6 +656,8 @@ void 
smc_ib_destroy_queue_pair(struct smc_link *lnk) if (lnk->roce_qp) ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; + lnk->smcibcq_send = NULL; + lnk->smcibcq_recv = NULL; } /* create a queue pair within the protection domain for a link */ @@ -665,8 +667,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->roce_cq_send, - .recv_cq = lnk->smcibdev->roce_cq_recv, + .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, + .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -693,10 +695,13 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); - if (IS_ERR(lnk->roce_qp)) + if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; - else + } else { + lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; + lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; smc_wr_remember_qp_attr(lnk); + } return rc; } @@ -843,10 +848,21 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } +static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) +{ + ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); + kfree(smcibdev->ib_cq_send); + smcibdev->ib_cq_send = NULL; + + ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); + kfree(smcibdev->ib_cq_recv); + smcibdev->ib_cq_recv = NULL; +} + long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { - struct ib_cq_init_attr cqattr = { - .cqe = SMC_MAX_CQE, .comp_vector = 0 }; + struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; + struct smc_ib_cq *smcibcq_send, *smcibcq_recv; int cqe_size_order, smc_order; long rc; @@ -859,28 +875,49 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - 
smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibdev, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); - if (IS_ERR(smcibdev->roce_cq_send)) { - smcibdev->roce_cq_send = NULL; + smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); + if (!smcibcq_send) { + rc = -ENOMEM; + goto out; + } + smcibcq_send->smcibdev = smcibdev; + smcibcq_send->is_send = 1; + cqattr.comp_vector = 0; + smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibcq_send, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); + if (IS_ERR(smcibdev->ib_cq_send)) { + smcibdev->ib_cq_send = NULL; goto out; } - smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibdev, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); - if (IS_ERR(smcibdev->roce_cq_recv)) { - smcibdev->roce_cq_recv = NULL; - goto err; + smcibdev->ib_cq_send = smcibcq_send; + + smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); + if (!smcibcq_recv) { + rc = -ENOMEM; + goto err_send; + } + smcibcq_recv->smcibdev = smcibdev; + cqattr.comp_vector = 1; + smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibcq_recv, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); + if (IS_ERR(smcibdev->ib_cq_recv)) { + smcibdev->ib_cq_recv = NULL; + goto err_recv; } + smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; goto out; -err: - ib_destroy_cq(smcibdev->roce_cq_send); +err_recv: + kfree(smcibcq_recv); + ib_destroy_cq(smcibcq_send->ib_cq); +err_send: + kfree(smcibcq_send); out: mutex_unlock(&smcibdev->mutex); return rc; @@ -892,8 +929,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) goto out; smcibdev->initialized = 0; - ib_destroy_cq(smcibdev->roce_cq_recv); - ib_destroy_cq(smcibdev->roce_cq_send); + smc_ib_cleanup_cq(smcibdev); smc_wr_remove_dev(smcibdev); out: 
mutex_unlock(&smcibdev->mutex); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 034295676e88..15b213f19c6e 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -32,15 +32,20 @@ struct smc_ib_devices { /* list of smc ib devices definition */ extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ +struct smc_ib_cq { /* ib_cq wrapper for smc */ + struct smc_ib_device *smcibdev; /* parent ib device */ + struct ib_cq *ib_cq; /* real ib_cq for link */ + struct tasklet_struct tasklet; /* tasklet for wr */ + bool is_send; /* send for recv cq */ +}; + struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - struct ib_cq *roce_cq_send; /* send completion queue */ - struct ib_cq *roce_cq_recv; /* recv completion queue */ - struct tasklet_struct send_tasklet; /* called by send cq handler */ - struct tasklet_struct recv_tasklet; /* called by recv cq handler */ + struct smc_ib_cq *ib_cq_send; /* send completion queue */ + struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 55c1deb6bc7f..b30c23469704 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -136,7 +136,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); + struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int i = 0, rc; int polled = 0; @@ -145,9 +145,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(dev->roce_cq_send, 
SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(dev->roce_cq_send, + ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); } @@ -162,9 +162,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - tasklet_schedule(&dev->send_tasklet); + tasklet_schedule(&smcibcq->tasklet); } /*---------------------------- request submission ---------------------------*/ @@ -327,7 +327,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibdev->roce_cq_send, + ib_req_notify_cq(link->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { @@ -371,7 +371,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, + ib_req_notify_cq(link->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; @@ -486,7 +486,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); + struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int polled = 0; int rc; @@ -495,9 +495,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(dev->roce_cq_recv, + 
ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); } @@ -511,9 +511,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - tasklet_schedule(&dev->recv_tasklet); + tasklet_schedule(&smcibcq->tasklet); } int smc_wr_rx_post_init(struct smc_link *link) @@ -845,14 +845,14 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->recv_tasklet); - tasklet_kill(&smcibdev->send_tasklet); + tasklet_kill(&smcibdev->ib_cq_recv->tasklet); + tasklet_kill(&smcibdev->ib_cq_send->tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 106b2da1ac2655e63ef1e499fd70e4cf6682f867 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 13 Jan 2022 17:34:53 +0800 Subject: [PATCH 54/95] anolis: net/smc: Multiple CQs per IB devices ANBZ: #1742 ANBZ: #264 This allows multiple CQs for one IB device, compared to one CQ now. During IB device setup, it would initialize ibdev->num_comp_vectors amount of send/recv CQs, and the corresponding tasklets, like queues for net devices. Every smc_link has its own send and recv CQs, which are always assigned from the least used CQs of the current IB device. 
Signed-off-by: Tony Lu Reviewed-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 139 +++++++++++++++++++++++++++++++---------------- net/smc/smc_ib.h | 6 +- net/smc/smc_wr.c | 18 ++++-- 3 files changed, 111 insertions(+), 52 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 251cc60b7c8c..e1b09307da06 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -630,6 +630,36 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, + bool is_send) +{ + struct smc_ib_cq *smcibcq, *cq; + int min, i; + + if (is_send) + smcibcq = smcibdev->smcibcq_send; + else + smcibcq = smcibdev->smcibcq_recv; + + cq = smcibcq; + min = cq->load; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + if (smcibcq[i].load < min) { + cq = &smcibcq[i]; + min = cq->load; + } + } + + cq->load++; + return cq; +} + +static void smc_ib_put_cq(struct smc_ib_cq *smcibcq) +{ + smcibcq->load--; +} + static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -653,8 +683,11 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - if (lnk->roce_qp) + if (lnk->roce_qp) { ib_destroy_qp(lnk->roce_qp); + smc_ib_put_cq(lnk->smcibcq_send); + smc_ib_put_cq(lnk->smcibcq_recv); + } lnk->roce_qp = NULL; lnk->smcibcq_send = NULL; lnk->smcibcq_recv = NULL; @@ -663,12 +696,16 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { + struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, + true); + struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, + false); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, - .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, + .send_cq = smcibcq_send->ib_cq, + .recv_cq = smcibcq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -698,8 +735,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; } else { - lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; - lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; + lnk->smcibcq_send = smcibcq_send; + lnk->smcibcq_recv = smcibcq_recv; smc_wr_remember_qp_attr(lnk); } return rc; @@ -850,20 +887,26 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { - ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); - kfree(smcibdev->ib_cq_send); - smcibdev->ib_cq_send = NULL; + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + if (smcibdev->smcibcq_send[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); + + if (smcibdev->smcibcq_recv[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); + } - ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); - kfree(smcibdev->ib_cq_recv); - smcibdev->ib_cq_recv = NULL; + kfree(smcibdev->smcibcq_send); + kfree(smcibdev->smcibcq_recv); } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; - struct smc_ib_cq *smcibcq_send, *smcibcq_recv; int cqe_size_order, smc_order; + struct smc_ib_cq *smcibcq; + int i, num_cq_peer; long rc; mutex_lock(&smcibdev->mutex); @@ -875,49 +918,53 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); - if (!smcibcq_send) { + num_cq_peer = min_t(int, 
smcibdev->ibdev->num_comp_vectors, + num_online_cpus()); + smcibdev->num_cq_peer = num_cq_peer; + smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), + GFP_KERNEL); + if (!smcibdev->smcibcq_send) { rc = -ENOMEM; - goto out; - } - smcibcq_send->smcibdev = smcibdev; - smcibcq_send->is_send = 1; - cqattr.comp_vector = 0; - smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq_send, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); - if (IS_ERR(smcibdev->ib_cq_send)) { - smcibdev->ib_cq_send = NULL; - goto out; + goto err; } - smcibdev->ib_cq_send = smcibcq_send; - - smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); - if (!smcibcq_recv) { + smcibdev->smcibcq_recv = kcalloc(num_cq_peer, sizeof(*smcibcq), + GFP_KERNEL); + if (!smcibdev->smcibcq_recv) { rc = -ENOMEM; - goto err_send; + goto err; } - smcibcq_recv->smcibdev = smcibdev; - cqattr.comp_vector = 1; - smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibcq_recv, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); - if (IS_ERR(smcibdev->ib_cq_recv)) { - smcibdev->ib_cq_recv = NULL; - goto err_recv; + + /* initialize CQs */ + for (i = 0; i < num_cq_peer; i++) { + /* initialize send CQ */ + smcibcq = &smcibdev->smcibcq_send[i]; + smcibcq->smcibdev = smcibdev; + smcibcq->is_send = 1; + cqattr.comp_vector = i; + smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibcq, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); + if (IS_ERR(smcibcq->ib_cq)) + goto err; + + /* initialize recv CQ */ + smcibcq = &smcibdev->smcibcq_recv[i]; + smcibcq->smcibdev = smcibdev; + cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ + smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibcq, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); + if (IS_ERR(smcibcq->ib_cq)) + goto err; } - smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); 
smcibdev->initialized = 1; goto out; -err_recv: - kfree(smcibcq_recv); - ib_destroy_cq(smcibcq_send->ib_cq); -err_send: - kfree(smcibcq_send); +err: + smc_ib_cleanup_cq(smcibdev); out: mutex_unlock(&smcibdev->mutex); return rc; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 15b213f19c6e..456d59670031 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -37,6 +37,7 @@ struct smc_ib_cq { /* ib_cq wrapper for smc */ struct ib_cq *ib_cq; /* real ib_cq for link */ struct tasklet_struct tasklet; /* tasklet for wr */ bool is_send; /* send for recv cq */ + int load; /* load of current cq */ }; struct smc_ib_device { /* ib-device infos for smc */ @@ -44,8 +45,9 @@ struct smc_ib_device { /* ib-device infos for smc */ struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - struct smc_ib_cq *ib_cq_send; /* send completion queue */ - struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ + int num_cq_peer; /* num of snd/rcv cq peer */ + struct smc_ib_cq *smcibcq_send; /* send cqs */ + struct smc_ib_cq *smcibcq_recv; /* recv cqs */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index b30c23469704..937339fd1fdb 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -845,14 +845,24 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->ib_cq_recv->tasklet); - tasklet_kill(&smcibdev->ib_cq_send->tasklet); + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); + tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); + } } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); + int i; + + 
for (i = 0; i < smcibdev->num_cq_peer; i++) { + tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, + smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, + smc_wr_rx_tasklet_fn); + } } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 9736f1141e2fb2f9e76fd944af4234de50177137 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Mon, 14 Feb 2022 16:31:21 +0800 Subject: [PATCH 55/95] anolis: net/smc: Keep first contact clcsock ANBZ: #1742 ANBZ: #264 This introduces a work around for eRDMA. eRDMA reuse the first TCP tuple to create QP, and don't want to release it. But SMC will release this tuple when first contact connection is shutdown. This patch keeps the first contact connection, and delay the shutdown work to link (QP) release progress. Be careful, this patch reverses TCP close process, which means server side closes clcsock when link clears. Signed-off-by: Tony Lu Reviewed-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 3 ++- net/smc/smc.h | 1 + net/smc/smc_close.c | 7 +++++-- net/smc/smc_core.c | 6 ++++++ net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 3 +++ net/smc/smc_sysctl.c | 10 ++++++++++ 8 files changed, 30 insertions(+), 3 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 15fd18fe51e3..a532c9e6fe5a 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -30,5 +30,6 @@ struct netns_smc { int sysctl_rmem_default; int sysctl_tcp2smc; int sysctl_allow_different_subnet; + int sysctl_keep_first_contact_clcsock; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index be59e86fda96..9898b5794891 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -387,6 +387,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_sndbuf = net->smc.sysctl_wmem_default; sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); + smc->keep_clcsock = 0; 
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); @@ -2857,7 +2858,7 @@ static int smc_shutdown(struct socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (do_shutdown && smc->clcsock) + if (do_shutdown && smc->clcsock && !smc->keep_clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; diff --git a/net/smc/smc.h b/net/smc/smc.h index 9d73fc5fdbc2..05864aeb7909 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -253,6 +253,7 @@ struct smc_sock { /* smc sock container */ /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ + bool keep_clcsock; struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 31db7438857c..038bcafe9a9e 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -28,10 +28,12 @@ void smc_clcsock_release(struct smc_sock *smc) if (smc->listen_smc && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); + /* don't release clcsock for eRDMA */ if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; - sock_release(tcp); + if (!smc->keep_clcsock) + sock_release(tcp); } mutex_unlock(&smc->clcsock_release_lock); } @@ -239,7 +241,8 @@ int smc_close_active(struct smc_sock *smc) /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. */ - if (smc->clcsock && smc->clcsock->sk) { + if (smc->clcsock && smc->clcsock->sk && + !smc->keep_clcsock) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? 
rc : rc1; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9b3d1936ff20..2b4440f53101 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -916,6 +916,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; smcr_link_iw_extension(&lnk->iw_conn_param, smc->clcsock->sk); + lnk->clcsock = smc->clcsock; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { @@ -1264,6 +1265,8 @@ static void __smcr_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); + if (lnk->clcsock) + sock_release(lnk->clcsock); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); @@ -1942,6 +1945,9 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) create: if (ini->first_contact_local) { + /* keep this clcsock for QP reuse */ + if (net->smc.sysctl_keep_first_contact_clcsock) + smc->keep_clcsock = 1; rc = smc_lgr_create(smc, ini); if (rc) goto out; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 9130a6f87264..26e31abe501e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -167,6 +167,8 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ + + struct socket *clcsock; /* keep for eRDMA */ }; /* For now we just allow one parallel link per link group. 
The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4052651152c1..7a5c29fa4a44 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1119,6 +1119,8 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_reject; lnk_new = &lgr->lnk[lnk_idx]; lnk_new->iw_conn_param = link->iw_conn_param; + lnk_new->clcsock = link->clcsock; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); if (rc) goto out_reject; @@ -1490,6 +1492,7 @@ int smc_llc_srv_add_link(struct smc_link *link, } lgr->lnk[lnk_idx].iw_conn_param = link->iw_conn_param; + lgr->lnk[lnk_idx].clcsock = link->clcsock; rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); if (rc) goto out; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 69d12e6ad023..f9dc4f35c5e2 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -82,6 +82,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "keep_first_contact_clcsock", + .data = &init_net.smc.sysctl_keep_first_contact_clcsock, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -111,6 +120,7 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_rmem_default = 384 * 1024; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 1; + net->smc.sysctl_keep_first_contact_clcsock = 1; return 0; -- Gitee From b038f25b696c8db3650612c20f8784e79368acb5 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Mon, 7 Mar 2022 13:16:28 +0800 Subject: [PATCH 56/95] anolis: net/smc: Introduce a sysctl to disable {a}symmetric link group ANBZ: #1742 ANBZ: #264 When smc uses erdma as underlay implementation, smc link (rdma connection) is created according to five-tupe of tcp connection. However, the second smc link can't be created correctly if there is no another tcp connection. 
So we decide to disable {a}symmetric link groups by default in the erdma environment, via a sysctl, as a workaround. Signed-off-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 36 +++++++++++++++++++++--------------- net/smc/smc_core.c | 3 +++ net/smc/smc_llc.c | 9 +++++++++ net/smc/smc_sysctl.c | 10 ++++++++++ 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index a532c9e6fe5a..2022b6a9b745 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -31,5 +31,6 @@ struct netns_smc { int sysctl_tcp2smc; int sysctl_allow_different_subnet; int sysctl_keep_first_contact_clcsock; + int sysctl_disable_multiple_link; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9898b5794891..06ef616d3633 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -556,6 +556,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; + struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -602,20 +603,22 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* optional 2nd link, receive ADD LINK request from server */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, - SMC_LLC_ADD_LINK); - if (!qentry) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - if (rc == -EAGAIN) - rc = 0; /* no DECLINE received, go with one link */ - return rc; + if (!net->smc.sysctl_disable_multiple_link) { + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc,
&dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; + } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); } - smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - smc_llc_cli_add_link(link, qentry); return 0; } @@ -1803,6 +1806,7 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; + struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -1843,8 +1847,10 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link, NULL); + if (!net->smc.sysctl_disable_multiple_link) { + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); + } return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2b4440f53101..2ac4dfdfa240 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1678,6 +1678,9 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) lgr->type == SMC_LGR_ASYMMETRIC_PEER || !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + continue; /* trigger local add link processing */ link = smc_llc_usable_link(lgr); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 7a5c29fa4a44..a0149951d774 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1090,6 +1090,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) rc = -ENOMEM; goto out_reject; } + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out_reject; ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { @@ -1217,6 +1220,9 @@ static void smc_llc_cli_add_link_invite(struct 
smc_link *link, if (lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out; ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) @@ -1462,6 +1468,9 @@ int smc_llc_srv_add_link(struct smc_link *link, rc = -ENOMEM; goto out; } + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out; /* ignore client add link recommendation, start new flow */ ini->vlan_id = lgr->vlan_id; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index f9dc4f35c5e2..f846b2e8e765 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -91,6 +91,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "disable_multiple_link", + .data = &init_net.smc.sysctl_disable_multiple_link, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -121,6 +130,7 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 1; net->smc.sysctl_keep_first_contact_clcsock = 1; + net->smc.sysctl_disable_multiple_link = 1; return 0; -- Gitee From 172bf494daa479b5c520777dfc4791fb9b989d19 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 15 Mar 2022 15:52:29 +0800 Subject: [PATCH 57/95] anolis: net/smc: Introduce rtoken validity check before sending ANBZ: #1742 ANBZ: #264 The local peer might be still sending data when receiving remote peer requests for deleting rtoken. So the local peer might use an already deleted rkey for rdma write operation. In eRDMA scenario, this may cause a hung because eRDMA driver won't generate CQEs for sending with wrong rkey and cdc_pend_tx_wr won't reach zero anymore. So this patch tries to fix this by checking rtoken validity before rdma write operation. 
This won't cause data loss because at this moment, the remote peer must be SMC_CLOSE state and no longer want to receive any data. Signed-off-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_tx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 55a135345ebf..c7beaa1f38d9 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -357,6 +357,12 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; + /* rtoken might be deleted if peer freed connection */ + if (!rdma_wr->rkey || + (rdma_wr->remote_addr == (conn->tx_off + peer_rmbe_offset))) { + pr_warn_ratelimited("smc: unexpected sends during connection termination flow\n"); + return -EINVAL; + } rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); -- Gitee From 0969b6532fccf25d6280cc8c267ae05df7545057 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 22 Mar 2022 17:27:04 +0800 Subject: [PATCH 58/95] anolis: net/smc: don't req_notify until all CQEs drained ANBZ: #1742 ANBZ: #264 When we are handling softirq workload, enable hardirq may again interrupt the current routine of softirq, and then try to raise softirq again. This only wastes CPU cycles and won't have any real gain. Since IB_CQ_REPORT_MISSED_EVENTS already make sure if ib_req_notify_cq() returns 0, it is safe to wait for the next event, with no need to poll the CQ again in this case. This patch disables hardirq during the processing of softirq, and re-arm the CQ after softirq is done. Somehow like NAPI. 
Co-developed-by: Guangguan Wang Signed-off-by: Guangguan Wang Signed-off-by: Dust Li Signed-off-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 49 +++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 937339fd1fdb..5a5a2f4ea9d0 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -138,25 +138,28 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i = 0, rc; - int polled = 0; + int i, rc; again: - polled++; do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - if (polled == 1) { - ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS); - } - if (!rc) - break; for (i = 0; i < rc; i++) smc_wr_tx_process_cqe(&wc[i]); + if (rc < SMC_WR_MAX_POLL_CQE) + /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been + * drained, no need to poll again. + */ + break; } while (rc > 0); - if (polled == 1) + + /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, + * then it is safe to wait for the next event; else we must poll the + * CQ again to make sure we won't miss any event. 
+ */ + if (ib_req_notify_cq(smcibcq->ib_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; } @@ -488,24 +491,28 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int polled = 0; int rc; again: - polled++; do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - if (polled == 1) { - ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_SOLICITED_MASK - | IB_CQ_REPORT_MISSED_EVENTS); - } - if (!rc) + if (rc > 0) + smc_wr_rx_process_cqes(&wc[0], rc); + if (rc < SMC_WR_MAX_POLL_CQE) + /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been + * drained, no need to poll again. + */ break; - smc_wr_rx_process_cqes(&wc[0], rc); } while (rc > 0); - if (polled == 1) + + /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, + * then it is safe to wait for the next event; else we must poll the + * CQ again to make sure we won't miss any event. + */ + if (ib_req_notify_cq(smcibcq->ib_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; } -- Gitee From 72399d1885a4d80b29296753c2a485b68d230331 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 26 Apr 2022 21:25:10 +0800 Subject: [PATCH 59/95] anolis: net/smc: Fix NULL sk pointer when access clcsock ANBZ: #1742 This patch fixes NULL sk pointer in clcsock. 
Signed-off-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 5 ++++- net/smc/smc_clc.c | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 06ef616d3633..324e9451a704 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1860,8 +1860,11 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + mutex_lock(&new_smc->clcsock_release_lock); + if (new_smc->clcsock && new_smc->clcsock->sk && + tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); + mutex_unlock(&new_smc->clcsock_release_lock); if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ba20049ef6ce..9a75119b3437 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -795,7 +795,13 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = send_len; + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock || !smc->clcsock->sk) { + mutex_unlock(&smc->clcsock_release_lock); + return -EPROTO; + } len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); + mutex_unlock(&smc->clcsock_release_lock); if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 0 : len; -- Gitee From c073c77863fc5628ce3b8ba544ed516dade0f710 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 27 Apr 2022 15:24:43 +0800 Subject: [PATCH 60/95] anolis: net/smc: Avoid clcsock access panic ANBZ: #1742 This patch is a set of the workaround for clcsock access panic. There are two kinds of invalid access of clcsock. 
1) Access smc->clcsock when smc->clcsock is reset to NULL; 2) Access smc->clcsock->sk when sock_release(clcsock); In upstream implementation, only 1) happens, and it is fixed by c0bf3d8a943b ("net/smc: Transitional solution for clcsock race issue"). In anolis implementation, 1) and 2) are both reproduced. They are mainly triggered by c5e5a9f9c5d8 ("net/smc: Keep first contact clcsock"). In anolis smc implementation, The first contact's clcsock is saved in link struct and may be released during smc link clear. After that, if smc->clcsock is accessed, a NULL pointer panic will happen. This patch provides a workaround for these. To eradicate such issues, We may need to avoid using first contact's clcsock as erdma link. Fixes: c0bf3d8a943b ("net/smc: Transitional solution for clcsock race issue"). Signed-off-by: Wen Gu Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 324e9451a704..3e990d222350 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1633,6 +1633,11 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, break; } + if (!smc->clcsock || + (smc->clcsock && !smc->clcsock->sk)) { + rc = -EBADF; + goto out; + } smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; if (smc->connect_nonblock) { @@ -1695,10 +1700,12 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); + mutex_lock(&lsmc->clcsock_release_lock); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); + mutex_unlock(&lsmc->clcsock_release_lock); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; -- Gitee From edc035718717c1bde5eafd327cec5db11d687de2 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 6 May 2022 
10:19:04 +0800 Subject: [PATCH 61/95] anolis: net/smc: do not send msg in receiving process when tx is not blocked. ANBZ: #1742 As the user send thread (normal send path) and the tx completion tasklet (corked send path) will send msgs, there is no need to send msgs in the recv completion tasklet when the RMB's ci is updated and smc_tx_prepared_sends is non-zero, except under the write_blocked condition. Sending there may slow down recv performance, as the recv completion tasklet is shared by multiple connections. In the netty benchmark, this shows a 28% improvement in throughput: Before: throughput cpu sys usr thread-480 connect-48 len-8: 1653807.614 124.755 69.0489 55.7061 After: throughput cpu sys usr thread-480 connect-48 len-8: 2113879.617 132.117 67.9467 64.1707 Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_cdc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 410134dccbf9..482e60753216 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -363,7 +363,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, } /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - if ((diff_cons && smc_tx_prepared_sends(conn)) || + if ((diff_cons && smc_tx_prepared_sends(conn) && + conn->local_tx_ctrl.prod_flags.write_blocked) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || conn->local_rx_ctrl.prod_flags.urg_data_pending) { if (!sock_owned_by_user(&smc->sk)) -- Gitee From 4406c9bfb0d96516c30692d20476087f614e9647 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 6 May 2022 10:45:52 +0800 Subject: [PATCH 62/95] anolis: net/smc: compress frequency of credits announcement by cdc msg ANBZ: #1742 Under heavy traffic, the credits taken by each cdc msg may be few, and the recv side is woken up frequently on credits updates, which may use more CPU.
Setting an announcement watermark, which is 10% of the local rq credits, reduces the announcement frequency, so that the credits taken by a cdc msg amount to more than 10% of the local rq credits, which reduces the wakeup frequency on the recv side. In the netty benchmark, this shows a 28% improvement in throughput: Before: throughput cpu sys usr thread-480 connect-48 len-8: 1653807.614 124.755 69.0489 55.7061 After: throughput cpu sys usr thread-480 connect-48 len-8: 2113879.617 132.117 67.9467 64.1707 Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_cdc.c | 3 ++- net/smc/smc_core.h | 1 + net/smc/smc_wr.c | 5 +++++ net/smc/smc_wr.h | 11 +++++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 482e60753216..25b836df9f50 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -121,7 +121,8 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc(cdc_msg, conn, &cfed); - saved_credits = (u8)smc_wr_rx_get_credits(link); + if (smc_wr_rx_credits_need_announce_frequent(link)) + saved_credits = (u8)smc_wr_rx_get_credits(link); cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 26e31abe501e..a823e6a7a537 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -138,6 +138,7 @@ struct smc_link { u8 credits_enable; /* credits enable flag, set when negotiation */ u8 local_cr_watermark_high; /* local rq credits watermark */ u8 peer_cr_watermark_low; /* peer rq credits watermark */ + u8 credits_update_limit; /* credits update limit for cdc msg */ struct work_struct credits_announce_work; /* work for credits announcement */ unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 5a5a2f4ea9d0..2971b3a73bf2 100644 --- a/net/smc/smc_wr.c +++
b/net/smc/smc_wr.c @@ -923,6 +923,11 @@ int smc_wr_create_link(struct smc_link *lnk) lnk->flags = 0; lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); lnk->peer_cr_watermark_low = 0; + + /* if credits accumlated less than 10% of wr_rx_cnt(at least 5), + * will not be announced by cdc msg. + */ + lnk->credits_update_limit = max(lnk->wr_rx_cnt / 10, 5U); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 8cf276215c91..5b671065afdc 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -133,6 +133,17 @@ static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; } +static inline int smc_wr_rx_credits_need_announce_frequent(struct smc_link *link) +{ + /* announce when local rq credits accumulated more than credits_update_limit, or + * peer rq credits is empty. As peer credits empty and local credits is less than + * credits_update_limit, may results in credits deadlock. + */ + return link->credits_enable && + (atomic_read(&link->local_rq_credits) >= link->credits_update_limit || + !atomic_read(&link->peer_rq_credits)); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { -- Gitee From 90b0d2eac7a37ed146a3163ae0cb282376e2281f Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 21 Mar 2022 21:10:41 +0800 Subject: [PATCH 63/95] anolis: net/smc: Release lock before waiting for CLC accept message ANBZ: #1742 Applications use to call setsockopt() after connect(), which requires under the sock lock. Holding the sock lock during the CLC handshake may cause applications have to wait until CLC accept message got ready. Signed-off-by: D. 
Wythe Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3e990d222350..4d351b0fe852 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1129,9 +1129,13 @@ static int smc_connect_clc(struct smc_sock *smc, rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; + + release_sock(&smc->sk); /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, - SMC_CLC_ACCEPT, CLC_WAIT_TIME); + rc = smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + SMC_CLC_ACCEPT, CLC_WAIT_TIME); + lock_sock(&smc->sk); + return rc; } void smc_fill_gid_list(struct smc_link_group *lgr, -- Gitee From e965a35bccb5f835569a1a542ccdf0f4addddc29 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 5 May 2022 12:57:42 +0800 Subject: [PATCH 64/95] anolis: net/smc: Disable confirm rkey message exchange when only one link exists ANBZ: #1742 If there is only one link between the two sides of communication, it is not necessary to perform confirm RKey message exchange. Signed-off-by: D. 
Wythe Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4d351b0fe852..9aecf30e2747 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -523,7 +523,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { struct smc_link_group *lgr = link->lgr; - int i, rc = 0; + int i, lnk = 0, rc = 0; rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) @@ -538,14 +538,20 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; + /* available link count inc */ + lnk++; } - /* exchange confirm_rkey msg with peer */ - rc = smc_llc_do_confirm_rkey(link, rmb_desc); - if (rc) { - rc = -EFAULT; - goto out; + /* do not exchange confirm_rkey msg since there are only one link */ + if (lnk > 1) { + /* exchange confirm_rkey msg with peer */ + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; + } } + rmb_desc->is_conf_rkey = true; out: mutex_unlock(&lgr->llc_conf_mutex); -- Gitee From 11c081c98b8169515ffa7abdcd94291af811eb1a Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 31 Mar 2022 13:17:28 +0800 Subject: [PATCH 65/95] anolis: net/smc: Avoid syscall block by async smc_conn_free ANBZ: #1742 smc_conn_free() will wait for rkey delete message, which will block the application syscall. Signed-off-by: D. 
Wythe Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 35 ++++++++++++++++++++++++++++------- net/smc/smc.h | 1 + 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9aecf30e2747..f5e47f8f5ab1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -300,13 +300,9 @@ static int __smc_release(struct smc_sock *smc) sk->sk_prot->unhash(sk); if (sk->sk_state == SMC_CLOSED) { - if (smc->clcsock) { - release_sock(sk); - smc_clcsock_release(smc); - lock_sock(sk); - } - if (!smc->use_fallback) - smc_conn_free(&smc->conn); + sock_hold(sk); + if (!queue_work(smc_hs_wq, &smc->free_work)) + sock_put(sk); } return rc; @@ -368,6 +364,30 @@ static void smc_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } +static void smc_free_work(struct work_struct *work) +{ + struct sock *sk; + struct smc_sock *smc = container_of(work, struct smc_sock, + free_work); + + sk = &smc->sk; + + lock_sock(sk); + if (sk->sk_state == SMC_CLOSED) { + if (smc->clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } + + if (!smc->use_fallback) + smc_conn_free(&smc->conn); + } + release_sock(sk); + + sock_put(sk); /* before queue */ +} + static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, int protocol) { @@ -389,6 +409,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, smc = smc_sk(sk); smc->keep_clcsock = 0; INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_WORK(&smc->free_work, smc_free_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); diff --git a/net/smc/smc.h b/net/smc/smc.h index 05864aeb7909..578853227e46 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -257,6 +257,7 @@ struct smc_sock { /* smc sock container */ struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct 
tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ + struct work_struct free_work; /* free smc conn */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool limit_smc_hs; /* put constraint on handshake */ -- Gitee From 68836f5e11584b144df763b1d856357e4926fdea Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 25 May 2022 10:47:50 +0800 Subject: [PATCH 66/95] anolis: net/smc: move wc loop out of smc_wr_rx_process_cqes ANBZ: #1742 move wc loop out of smc_wr_rx_process_cqes to align the behaviour of smc_wr_tx_process_cqe. Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 52 ++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 2971b3a73bf2..4df29a0fafd2 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -454,36 +454,32 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) } } -static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) +static inline void smc_wr_rx_process_cqe(struct ib_wc *wc) { - struct smc_link *link; - int i; + struct smc_link *link = wc->qp->qp_context; - for (i = 0; i < num; i++) { - link = wc[i].qp->qp_context; - if (wc[i].status == IB_WC_SUCCESS) { - link->wr_rx_tstamp = jiffies; - smc_wr_rx_demultiplex(&wc[i]); + if (wc->status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; + smc_wr_rx_demultiplex(wc); + smc_wr_rx_post(link); /* refill WR RX */ + } else { + /* handle status errors */ + switch (wc->status) { + case IB_WC_RETRY_EXC_ERR: + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + smcr_link_down_cond_sched(link); + break; + default: smc_wr_rx_post(link); /* refill WR RX */ - } else { - /* handle status errors */ - switch (wc[i].status) { - case IB_WC_RETRY_EXC_ERR: - case 
IB_WC_RNR_RETRY_EXC_ERR: - case IB_WC_WR_FLUSH_ERR: - smcr_link_down_cond_sched(link); - break; - default: - smc_wr_rx_post(link); /* refill WR RX */ - break; - } + break; } + } - if (smc_wr_rx_credits_need_announce(link) && - !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { - set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); - schedule_work(&link->credits_announce_work); - } + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); } } @@ -491,14 +487,14 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int rc; + int i, rc; again: do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - if (rc > 0) - smc_wr_rx_process_cqes(&wc[0], rc); + for (i = 0; i < rc; i++) + smc_wr_rx_process_cqe(&wc[i]); if (rc < SMC_WR_MAX_POLL_CQE) /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been * drained, no need to poll again. -- Gitee From 133624d808d1d6b201217be927f6606f20a8e5d7 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 25 May 2022 11:37:48 +0800 Subject: [PATCH 67/95] anolis: net/smc: combine send cq and recv cq into one cq ANBZ: #1742 SMC-R uses two CQs per link, one for the SQ, called SCQ, and the other for the RQ, called RCQ. RDMA allows the SCQ and the RCQ to be the same CQ. In RDMA, more CQs mean more interrupts, as fewer cqes are polled out by each poll_cq. This patch combines send cq and recv cq into one cq. Because the number of CQs is halved, fewer interrupts are generated and hi usage is lower.
Nginx benchmark shows 5.8% improvement in throughput: Server test command: smc_run nginx Client test command: smc_run /opt/wrk/wrk http://ip:port -t 32 -c 992 -d 30 --latency Before: Requests/sec: 1927316.76 Transfer/sec: 295.92MB After: Requests/sec: 2039360.72 Transfer/sec: 313.13MB Redis benchmark shows 8% improvement in cpu usage: Server test command: smc_run ./redis-server --save "" --appendonly no --protected-mode no sar 1 10 (Cpu usage collect command) Client test command: smc_run ./redis-benchmark -h -q -t set -P 1 --threads 7\ -n 25000000 -c 100 -d 10 Before: CPU %user %nice %system %iowait %steal %idle all 0.90 0.00 5.44 0.00 0.00 93.66 After: CPU %user %nice %system %iowait %steal %idle all 0.87 0.00 5.00 0.00 0.00 94.13 Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.h | 3 +- net/smc/smc_ib.c | 83 +++++++++++--------------------------- net/smc/smc_ib.h | 6 +-- net/smc/smc_wr.c | 99 +++++++++++++++++----------------------------- net/smc/smc_wr.h | 3 +- 5 files changed, 64 insertions(+), 130 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index a823e6a7a537..e1f613ac2f15 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -94,8 +94,7 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ - struct smc_ib_cq *smcibcq_recv; /* cq for recv */ - struct smc_ib_cq *smcibcq_send; /* cq for send */ + struct smc_ib_cq *smcibcq; /* cq for recv & send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index e1b09307da06..6d54861b2a3f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,12 +131,12 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, + rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, 
IB_CQ_SOLICITED_MASK); if (rc) goto out; - rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, + rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) goto out; @@ -630,21 +630,16 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, - bool is_send) +static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev) { struct smc_ib_cq *smcibcq, *cq; int min, i; - if (is_send) - smcibcq = smcibdev->smcibcq_send; - else - smcibcq = smcibdev->smcibcq_recv; - + smcibcq = smcibdev->smcibcq; cq = smcibcq; min = cq->load; - for (i = 0; i < smcibdev->num_cq_peer; i++) { + for (i = 0; i < smcibdev->num_cq; i++) { if (smcibcq[i].load < min) { cq = &smcibcq[i]; min = cq->load; @@ -685,27 +680,22 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) { if (lnk->roce_qp) { ib_destroy_qp(lnk->roce_qp); - smc_ib_put_cq(lnk->smcibcq_send); - smc_ib_put_cq(lnk->smcibcq_recv); + smc_ib_put_cq(lnk->smcibcq); } lnk->roce_qp = NULL; - lnk->smcibcq_send = NULL; - lnk->smcibcq_recv = NULL; + lnk->smcibcq = NULL; } /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, - true); - struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, - false); + struct smc_ib_cq *smcibcq = smc_ib_get_least_used_cq(lnk->smcibdev); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = smcibcq_send->ib_cq, - .recv_cq = smcibcq_recv->ib_cq, + .send_cq = smcibcq->ib_cq, + .recv_cq = smcibcq->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -735,8 +725,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; } else { - lnk->smcibcq_send = smcibcq_send; - lnk->smcibcq_recv = smcibcq_recv; + lnk->smcibcq = smcibcq; smc_wr_remember_qp_attr(lnk); } return rc; @@ -889,16 +878,12 @@ static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { int i; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - if (smcibdev->smcibcq_send[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); - - if (smcibdev->smcibcq_recv[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); + for (i = 0; i < smcibdev->num_cq; i++) { + if (smcibdev->smcibcq[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq[i].ib_cq); } - kfree(smcibdev->smcibcq_send); - kfree(smcibdev->smcibcq_recv); + kfree(smcibdev->smcibcq); } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) @@ -906,7 +891,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; int cqe_size_order, smc_order; struct smc_ib_cq *smcibcq; - int i, num_cq_peer; + int i, num_cq; long rc; mutex_lock(&smcibdev->mutex); @@ -918,42 +903,22 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - num_cq_peer = min_t(int, smcibdev->ibdev->num_comp_vectors, - num_online_cpus()); - smcibdev->num_cq_peer = num_cq_peer; - smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_send) { - rc = -ENOMEM; - goto err; - } - smcibdev->smcibcq_recv = kcalloc(num_cq_peer, 
sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_recv) { + num_cq = min_t(int, smcibdev->ibdev->num_comp_vectors, + num_online_cpus()); + smcibdev->num_cq = num_cq; + smcibdev->smcibcq = kcalloc(num_cq, sizeof(*smcibcq), GFP_KERNEL); + if (!smcibdev->smcibcq) { rc = -ENOMEM; goto err; } /* initialize CQs */ - for (i = 0; i < num_cq_peer; i++) { - /* initialize send CQ */ - smcibcq = &smcibdev->smcibcq_send[i]; + for (i = 0; i < num_cq; i++) { + smcibcq = &smcibdev->smcibcq[i]; smcibcq->smcibdev = smcibdev; - smcibcq->is_send = 1; cqattr.comp_vector = i; smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) - goto err; - - /* initialize recv CQ */ - smcibcq = &smcibdev->smcibcq_recv[i]; - smcibcq->smcibdev = smcibdev; - cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, + smc_wr_cq_handler, NULL, smcibcq, &cqattr); rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); if (IS_ERR(smcibcq->ib_cq)) diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 456d59670031..62f4e5619147 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -36,7 +36,6 @@ struct smc_ib_cq { /* ib_cq wrapper for smc */ struct smc_ib_device *smcibdev; /* parent ib device */ struct ib_cq *ib_cq; /* real ib_cq for link */ struct tasklet_struct tasklet; /* tasklet for wr */ - bool is_send; /* send for recv cq */ int load; /* load of current cq */ }; @@ -45,9 +44,8 @@ struct smc_ib_device { /* ib-device infos for smc */ struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. 
port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - int num_cq_peer; /* num of snd/rcv cq peer */ - struct smc_ib_cq *smcibcq_send; /* send cqs */ - struct smc_ib_cq *smcibcq_recv; /* recv cqs */ + int num_cq; /* num of snd/rcv cq */ + struct smc_ib_cq *smcibcq; /* send & recv cqs */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 4df29a0fafd2..cb8bd0e04cb4 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -134,42 +134,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) wake_up(&link->wr_tx_wait); } -static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) -{ - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); - struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i, rc; - -again: - do { - memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - for (i = 0; i < rc; i++) - smc_wr_tx_process_cqe(&wc[i]); - if (rc < SMC_WR_MAX_POLL_CQE) - /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been - * drained, no need to poll again. - */ - break; - } while (rc > 0); - - /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, - * then it is safe to wait for the next event; else we must poll the - * CQ again to make sure we won't miss any event. 
- */ - if (ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0) - goto again; -} - -void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) -{ - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - - tasklet_schedule(&smcibcq->tasklet); -} - /*---------------------------- request submission ---------------------------*/ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) @@ -330,7 +294,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibcq_send->ib_cq, + ib_req_notify_cq(link->smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { @@ -374,7 +338,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibcq_send->ib_cq, + ib_req_notify_cq(link->smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; @@ -483,7 +447,19 @@ static inline void smc_wr_rx_process_cqe(struct ib_wc *wc) } } -static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) +int smc_wr_rx_post_init(struct smc_link *link) +{ + u32 i; + int rc = 0; + + for (i = 0; i < link->wr_rx_cnt; i++) + rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); + return rc; +} + +static void smc_wr_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; @@ -493,8 +469,21 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - for (i = 0; i < rc; i++) - smc_wr_rx_process_cqe(&wc[i]); + for (i = 0; i < rc; i++) { + switch (wc[i].opcode) { + case IB_WC_REG_MR: + case IB_WC_SEND: + 
smc_wr_tx_process_cqe(&wc[i]); + break; + case IB_WC_RECV: + smc_wr_rx_process_cqe(&wc[i]); + break; + default: + pr_warn("smc: unexpected wc opcode %d, status %d, wr_id %llu.\n", + wc[i].opcode, wc[i].status, wc[i].wr_id); + break; + } + } if (rc < SMC_WR_MAX_POLL_CQE) /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been * drained, no need to poll again. @@ -512,25 +501,13 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) goto again; } -void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context) { struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; tasklet_schedule(&smcibcq->tasklet); } -int smc_wr_rx_post_init(struct smc_link *link) -{ - u32 i; - int rc = 0; - - for (i = 0; i < link->wr_rx_cnt; i++) - rc = smc_wr_rx_post(link); - // credits have already been announced to peer - atomic_set(&link->local_rq_credits, 0); - return rc; -} - /***************************** init, exit, misc ******************************/ void smc_wr_remember_qp_attr(struct smc_link *lnk) @@ -850,21 +827,17 @@ void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { int i; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); - tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); - } + for (i = 0; i < smcibdev->num_cq; i++) + tasklet_kill(&smcibdev->smcibcq[i].tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { int i; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, - smc_wr_tx_tasklet_fn); - tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, - smc_wr_rx_tasklet_fn); + for (i = 0; i < smcibdev->num_cq; i++) { + tasklet_setup(&smcibdev->smcibcq[i].tasklet, + smc_wr_tasklet_fn); } } diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 5b671065afdc..ce338e1ca6c2 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -187,12 +187,11 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct 
smc_wr_tx_pend_priv *priv, int len); int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); -void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); -void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr); #endif /* SMC_WR_H */ -- Gitee From 070819dc2ca6f69d1311945339c4d0c180af0ede Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 25 May 2022 15:24:44 +0800 Subject: [PATCH 68/95] anolis: net/smc: remove redundant ib_req_notify_cq ANBZ: #1742 Solicited flag is only used by RCQ. As SCQ and RCQ are combined into one CQ, we can not notify cq with solicited flag. And immediately after the solicited notify cq, another notify with next complete flag is performed, the state machine of CQ will also immediately switch from the Arm_Sol state to the Armed state, which is the same as the result of direct notify with next complete flag. So the code of notify CQ with solicited is redundant and meaningless. 
Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 6d54861b2a3f..485041bfd0d4 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,11 +131,6 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, - IB_CQ_SOLICITED_MASK); - if (rc) - goto out; - rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) -- Gitee From 22fb8608c2082db3425c73774ef230e8203e11fc Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 9 Jun 2022 09:48:43 +0800 Subject: [PATCH 69/95] anolis: net/smc: poll_cq one more time if the polled cqe is less than SMC_WR_MAX_POLL_CQE ANBZ: #1742 When the cq is notified with the IB_CQ_REPORT_MISSED_EVENTS flag, the rdma driver returns a positive value if the cq is not empty, and always arms the cq regardless of whether the cq is empty or not. Once the cq is armed while it is not empty, a cq interrupt will be generated even though the cq has been drained after the arm. Thus, if a new cqe is generated between draining the cq and arming it, SMC-R gets a positive value from ib_req_notify_cq, goes back to poll the cq and drains the cqe again, and the cq interrupt is useless in such a condition. In the nginx + wrk benchmark, about 10% of the cq interrupts are useless cq interrupts. Polling the cq one more time when the number of polled cqes is less than SMC_WR_MAX_POLL_CQE reduces the useless cq interrupts from 10% to 1%.
Nginx benchmark shows 7.5% improvement in throughput: Server test command: smc_run nginx Client test command: smc_run /opt/wrk/wrk http://ip:port -t 32 -c 992 -d 30 --latency Before: Requests/sec: 1983511.11 Transfer/sec: 304.55MB After: Requests/sec: 2133148.49 Transfer/sec: 327.53MB Fixes: f49d6eda516f (net/smc: don't req_notify until all CQEs drained) Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index cb8bd0e04cb4..a246e3bb9a4c 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -484,11 +484,6 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) break; } } - if (rc < SMC_WR_MAX_POLL_CQE) - /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been - * drained, no need to poll again. - */ - break; } while (rc > 0); /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, -- Gitee From b3dd0e9d17d6ff905051a55b648f5587675c731f Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 6 Jun 2022 10:52:43 +0800 Subject: [PATCH 70/95] anolis: net/smc: introduce 1RTT to SMC-R ANBZ: #1742 SMC-R 1rtt is currently an internal version. In order to be compatible with subsequent community versions, sysctl is used here, and it is turned on by default. We have noticed that single network interface card is mainstream on the cloud, dues to the advantages of cloud deployment costs and the cloud's own disaster recovery support. On the other hand, the emergence of RoCE LAG technology makes us no longer need to deal with multiple RDMA network interface cards by ourselves, just like NIC bonding does. In Alibaba, Roce LAG is widely used for RDMA. In that case, SMC-R have only one single link, if so, the RKEY LLC messages that to perform information exchange in all links are no longer needed, the SMC Proposal & accept has already complete the exchange of all information needed. 
So we think that we can remove the RKEY exchange in that case, which will save us 2-RTT over IB. We call it as SMC-R 2-RTT. We can use TCP fast open, carry the SMC proposal data by TCP SYN message, reduce the time that the SMC waits for the TCP connection to be established. This will save us another 1-RTT over IP. Signed-off-by: D. Wythe Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 3 +++ net/smc/af_smc.c | 35 +++++++++++++++++++++++++++++++---- net/smc/smc.h | 3 +++ net/smc/smc_sysctl.c | 21 +++++++++++++++++++++ 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 2022b6a9b745..debc45ab2c49 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -32,5 +32,8 @@ struct netns_smc { int sysctl_allow_different_subnet; int sysctl_keep_first_contact_clcsock; int sysctl_disable_multiple_link; + /* allow simplify rkey exchange when single link */ + unsigned int sysctl_simplify_rkey_exhcange; + unsigned int sysctl_smc_fastopen; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f5e47f8f5ab1..398f12b8bc92 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -70,6 +70,15 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); +static inline int smc_clcsock_enable_fastopen(struct smc_sock *smc, int is_server) +{ + int val = 1; + + return smc->clcsock->ops->setsockopt(smc->clcsock, SOL_TCP, + is_server ? 
TCP_FASTOPEN : TCP_FASTOPEN_CONNECT, + KERNEL_SOCKPTR(&val), sizeof(val)); +} + int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); @@ -420,6 +429,10 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); + /* default behavior from every net namespace */ + smc->simplify_rkey_exhcange = net->smc.sysctl_simplify_rkey_exhcange; + smc->smc_fastopen = net->smc.sysctl_smc_fastopen; + return sk; } @@ -540,9 +553,10 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, } /* register the new rmb on all links */ -static int smcr_lgr_reg_rmbs(struct smc_link *link, +static int smcr_lgr_reg_rmbs(struct smc_sock *smc, struct smc_buf_desc *rmb_desc) { + struct smc_link *link = smc->conn.lnk; struct smc_link_group *lgr = link->lgr; int i, lnk = 0, rc = 0; @@ -564,7 +578,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, } /* do not exchange confirm_rkey msg since there are only one link */ - if (lnk > 1) { + if (lnk > 1 || !smc->simplify_rkey_exhcange) { /* exchange confirm_rkey msg with peer */ rc = smc_llc_do_confirm_rkey(link, rmb_desc); if (rc) { @@ -1312,7 +1326,7 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } - if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { + if (smcr_lgr_reg_rmbs(smc, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } @@ -1577,6 +1591,11 @@ static void smc_connect_work(struct work_struct *work) if (!timeo) timeo = MAX_SCHEDULE_TIMEOUT; + + if (smc->smc_fastopen && + inet_sk(smc->clcsock->sk)->defer_connect) + goto defer_connect; + lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; @@ -1589,6 +1608,7 @@ static void smc_connect_work(struct work_struct *work) rc = 0; } release_sock(smc->clcsock->sk); +defer_connect: lock_sock(&smc->sk); if (rc != 0 || 
smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; @@ -1675,6 +1695,10 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, rc = -EALREADY; goto out; } + + if (smc->smc_fastopen && smc_clcsock_enable_fastopen(smc, /* is_server */ 0)) + smc->smc_fastopen = 0; /* rollback when setsockopt failed */ + rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; @@ -2226,7 +2250,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) conn->sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } - if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) + if (smcr_lgr_reg_rmbs(new_smc, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; } @@ -2604,6 +2628,9 @@ static int smc_listen(struct socket *sock, int backlog) if (smc->limit_smc_hs) tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + if (smc->smc_fastopen && smc_clcsock_enable_fastopen(smc, /* is server */ 1)) + smc->smc_fastopen = 0; /* rollback when setsockopt failed */ + rc = kernel_listen(smc->clcsock, backlog); if (rc) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); diff --git a/net/smc/smc.h b/net/smc/smc.h index 578853227e46..f133a19c6ca5 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -261,6 +261,9 @@ struct smc_sock { /* smc sock container */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool limit_smc_hs; /* put constraint on handshake */ + bool simplify_rkey_exhcange; /* simplify rkey exchange */ + /* enable SMC-R handshake proposal via tcp fastopen */ + bool smc_fastopen; bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index f846b2e8e765..170ac42485d2 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -100,6 +100,24 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + 
.procname = "simplify_rkey_exhcange", + .data = &init_net.smc.sysctl_simplify_rkey_exhcange, + .maxlen = sizeof(init_net.smc.sysctl_simplify_rkey_exhcange), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "fastopen", + .data = &init_net.smc.sysctl_smc_fastopen, + .maxlen = sizeof(init_net.smc.sysctl_smc_fastopen), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -131,6 +149,9 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_allow_different_subnet = 1; net->smc.sysctl_keep_first_contact_clcsock = 1; net->smc.sysctl_disable_multiple_link = 1; + /* default on */ + net->smc.sysctl_simplify_rkey_exhcange = 1; + net->smc.sysctl_smc_fastopen = 1; return 0; -- Gitee From 441e4453e089b69ad2cbb106dccd9c9900e9b3d7 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 27 Jun 2022 21:27:13 +0800 Subject: [PATCH 71/95] anolis: net/smc: Introduce rdma dim for smc ANBZ: #1742 Dynamic interrupt moderation can coalesce interrupts and reduce cpu utilization. In erdma environment with software interrupt moderation, which only has software timer for interrupt moderation. 
Nginx benchmark shows 13.7% improvement in throughput: Server test command: smc_run nginx Client test command: smc_run /opt/wrk/wrk http://ip:port -t 32 -c 992 -d 30 --latency Before: Requests/sec: 2467285.48 After: Requests/sec: 2804945.57 In CX4 environment with hardware interrupt moderation, Nginx benchmark shows 39.5% improvement in throughput: Before: Requests/sec: 1260906.32 After: Requests/sec: 1758493.18 Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_ib.h | 1 + net/smc/smc_wr.c | 8 ++++++- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 485041bfd0d4..6574a7cb1924 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -869,13 +869,69 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } +static const struct dim_cq_moder +smc_dim_profile[RDMA_DIM_PARAMS_NUM_PROFILES] = { + {1, 0, 1, 0}, + {1, 0, 4, 0}, + {2, 0, 4, 0}, + {2, 0, 8, 0}, + {4, 0, 8, 0}, + {16, 0, 8, 0}, + {16, 0, 16, 0}, + {32, 0, 16, 0}, + {32, 0, 32, 0}, +}; + +static void smc_ib_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + struct ib_cq *cq = dim->priv; + + u16 usec = smc_dim_profile[dim->profile_ix].usec; + u16 comps = smc_dim_profile[dim->profile_ix].comps; + + dim->state = DIM_START_MEASURE; + cq->device->ops.modify_cq(cq, comps, usec); +} + +static void smc_ib_dim_init(struct ib_cq *cq) +{ + struct dim *dim; + + if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim) + return; + + dim = kzalloc(sizeof(*dim), GFP_KERNEL); + if (!dim) + return; + + dim->state = DIM_START_MEASURE; + dim->tune_state = DIM_GOING_RIGHT; + dim->profile_ix = RDMA_DIM_START_PROFILE; + dim->priv = cq; + cq->dim = dim; + + INIT_WORK(&dim->work, smc_ib_dim_work); +} + +static void smc_ib_dim_destroy(struct ib_cq *cq) +{ + if 
(!cq->dim) + return; + + cancel_work_sync(&cq->dim->work); + kfree(cq->dim); +} + static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { int i; for (i = 0; i < smcibdev->num_cq; i++) { - if (smcibdev->smcibcq[i].ib_cq) + if (smcibdev->smcibcq[i].ib_cq) { + smc_ib_dim_destroy(smcibdev->smcibcq[i].ib_cq); ib_destroy_cq(smcibdev->smcibcq[i].ib_cq); + } } kfree(smcibdev->smcibcq); @@ -918,6 +974,8 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); if (IS_ERR(smcibcq->ib_cq)) goto err; + + smc_ib_dim_init(smcibcq->ib_cq); } smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 62f4e5619147..906a6c57cac3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index a246e3bb9a4c..84f8b9f3f363 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -463,7 +463,7 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i, rc; + int i, rc, completed = 0; again: do { @@ -484,6 +484,9 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) break; } } + + if (rc > 0) + completed += rc; } while (rc > 0); /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, @@ -494,6 +497,9 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; + + if (smcibcq->ib_cq->dim) + rdma_dim(smcibcq->ib_cq->dim, completed); } void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context) -- Gitee From c29c699104397fc6e79a6f22a82c1d1c789ed72b Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 27 Jun 2022 21:30:53 +0800 Subject: [PATCH 72/95] anolis: net/smc: clear ib_cq errno when ib_create_cq failed ANBZ: #1742 When ib_create_cq failed, ib_create_cq will return errno stored in 
ib_cq ptr, and then smc_ib_cleanup_cq will be called to destroy cq. The ib_cq ptr, where stores errno will pass to ib_destroy_cq, which will cause kernel crash. Clear errno in ib_cq ptr to NULL, when ib_create_cq failed. Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 6574a7cb1924..19b3e6ae6b20 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -972,8 +972,10 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_wr_cq_handler, NULL, smcibcq, &cqattr); rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) + if (IS_ERR(smcibcq->ib_cq)) { + smcibcq->ib_cq = NULL; goto err; + } smc_ib_dim_init(smcibcq->ib_cq); } -- Gitee From 8334b834cbb511de08313f88edb80952c38ccedf Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 27 Jun 2022 21:31:02 +0800 Subject: [PATCH 73/95] anolis: net/smc: remove redundant ib_req_notify_cq ANBZ: #1742 ib_req_notify_cq after ib_cq created and every cq event processed. Other ib_req_notify_cq is redundant. This patch also improves connecting performance, as ib_req_notify_cq in connecting process and in smc_wr_tasklet_fn has lock competition. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 7 +++---- net/smc/smc_wr.c | 4 ---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 19b3e6ae6b20..b922009bf6b8 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,10 +131,6 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc) - goto out; rc = smc_wr_rx_post_init(lnk); if (rc) @@ -978,6 +974,9 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) } smc_ib_dim_init(smcibcq->ib_cq); + rc = ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP); + if (rc) + goto err; } smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 84f8b9f3f363..978975547aef 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -294,8 +294,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibcq->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); @@ -338,8 +336,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibcq->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; link->wr_reg.mr = mr; -- Gitee From 5e73deb7aa45e8195fb4a0e13e34ea259b6efcb0 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 27 Jun 2022 21:31:10 +0800 Subject: [PATCH 74/95] anolis: net/smc: remove solicited flag for wr send ANBZ: #1742 In smc, CQ state machine will not turn to arm_sol, solicited flag is useless and harmful for rdma dim. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 978975547aef..08b310ff5db6 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -563,8 +563,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; lnk->wr_tx_ibs[i].num_sge = 1; lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; - lnk->wr_tx_ibs[i].send_flags = - IB_SEND_SIGNALED | IB_SEND_SOLICITED; + lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED; if (send_inline) lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; @@ -584,8 +583,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; lnk->wr_tx_v2_ib->num_sge = 1; lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; - lnk->wr_tx_v2_ib->send_flags = - IB_SEND_SIGNALED | IB_SEND_SOLICITED; + lnk->wr_tx_v2_ib->send_flags = IB_SEND_SIGNALED; } /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. -- Gitee From 045b3b41ace155a5940de6324da3caf461d69de5 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 21 Jun 2022 11:18:55 +0800 Subject: [PATCH 75/95] anolis: net/smc: Fix potential leaks on queued_smc_hs ANBZ: #1742 The following potential scenarios could cause leaks: atomic_inc(&lsmc->queued_smc_hs); ... smc_listen_out_err __smc_lgr_terminate smc_conn_kill switch sk.sk_state case SMC_INIT: break; sock_set_flag(sk, SOCK_DEAD); smc_close_passive_work old_state = SMC_INIT if (sk_state == SMC_INIT) sk_state = SMC_APPCLOSEWAIT1; sk_state=SMC_CLOSED old_state != sk_state; sk_state == SMC_CLOSED; sock_flag(sk, SOCK_DEAD) smc_clcsock_release() clcsock = NULL if (clcsock ...) atomic_dec(&lsmc->queued_smc_hs) Signed-off-by: D. 
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 11 ++++++----- net/smc/smc.h | 5 +++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 398f12b8bc92..bc7404a3dee2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1922,11 +1922,8 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - mutex_lock(&new_smc->clcsock_release_lock); - if (new_smc->clcsock && new_smc->clcsock->sk && - tcp_sk(new_smc->clcsock->sk)->syn_smc) + if (new_smc->smc_negotiated) atomic_dec(&lsmc->queued_smc_hs); - mutex_unlock(&new_smc->clcsock_release_lock); if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); @@ -2542,8 +2539,12 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) { + new_smc->smc_negotiated = 1; atomic_inc(&lsmc->queued_smc_hs); + /* memory barrier */ + smp_mb__after_atomic(); + } new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; diff --git a/net/smc/smc.h b/net/smc/smc.h index f133a19c6ca5..39b9caa0d783 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -280,6 +280,11 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ + u8 smc_negotiated : 1; + /* whether the smc_sock + * was successfully negotiated + * via TCP options. + */ u8 connect_nonblock : 1; /* non-blocking connect in * flight -- Gitee From 9475a424e2ab5db393ceedf49d9c016a7e647454 Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Fri, 24 Jun 2022 17:33:44 +0800 Subject: [PATCH 76/95] anolis: net/smc: fix deadlock when lgr terminating ANBZ: #1742 A potential deadlock may occur in the following scenarios: smc_close __smc_lgr_terminate lock_sock lock_sock smc_conn_kill smc_cdc_wait_pend_tx_wr wait_event(cdc_pend_tx_wq) Once SMC link is not available, pending CDC may not receive their corresponding cqe. Hence, a potential deadlock has occurred. Signed-off-by: D. Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_cdc.c | 12 ++++++++---- net/smc/smc_core.c | 4 ++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 25b836df9f50..4a5b4f1f24b3 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -136,7 +136,8 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_wr_rx_put_credits(link, saved_credits); - atomic_dec(&conn->cdc_pend_tx_wr); + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) || smc_link_usable(conn->lnk)) + wake_up(&conn->cdc_pend_tx_wq); } return rc; @@ -168,8 +169,10 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn, smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (unlikely(rc)) - atomic_dec(&conn->cdc_pend_tx_wr); + if (unlikely(rc)) { + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) || smc_link_usable(conn->lnk)) + wake_up(&conn->cdc_pend_tx_wq); + } return rc; } @@ -230,7 +233,8 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn) { - wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr)); + wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr) || + !smc_link_usable(conn->lnk) || conn->lgr->terminating); } /* Send a SMC-D CDC header. 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2ac4dfdfa240..edd86332a78b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1482,6 +1482,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) /* cancel free_work sync, will terminate when lgr->freeing is set */ cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; + /* memory barrier */ + smp_wmb(); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -1491,6 +1493,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); /* sock_put below */ + /* try wakeup all */ + wake_up_all(&conn->cdc_pend_tx_wq); lock_sock(&smc->sk); smc_conn_kill(conn, soft); release_sock(&smc->sk); -- Gitee From 541a40a698828c4e83ede97fecc0a7666f789d41 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 29 Jun 2022 20:38:08 +0800 Subject: [PATCH 77/95] anolis: net/smc: Use diff TCP EXPR MAGIC to avoid network middleware do simple echo ANBZ: #1742 We found that some network middleware echoes unknown TCP options, which confuses the SMC client. Signed-off-by: D.
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 2 ++ include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 21 ++++++++++++++++----- net/ipv4/tcp_output.c | 14 +++++++++++++- net/smc/smc_sysctl.c | 12 +++++++++++- 5 files changed, 44 insertions(+), 7 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index debc45ab2c49..5a888a25f527 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -35,5 +35,7 @@ struct netns_smc { /* allow simplify rkey exchange when single link */ unsigned int sysctl_simplify_rkey_exhcange; unsigned int sysctl_smc_fastopen; + /* use diff TCP experiment magic code */ + unsigned int sysctl_smc_experiments; }; #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 94532793107f..f791206af891 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -195,6 +195,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 +/* "SMCO" in EBCDIC encoding */ +#define TCPOPT_SMC_OK_MAGIC 0xE2D4C3D6 /* * TCP option lengths diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b8227e6a78bd..f524a5f8f552 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3903,15 +3903,26 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, + const struct net *net, int opsize) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && - opsize >= TCPOLEN_EXP_SMC_BASE && - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { - opt_rx->smc_ok = 1; - return true; + opsize >= TCPOLEN_EXP_SMC_BASE) { + /* syn ack */ + if (th->ack && net->smc.sysctl_smc_experiments) { + if (get_unaligned_be32(ptr) == TCPOPT_SMC_OK_MAGIC) { + opt_rx->smc_ok = 1; + return true; + } + return false; + } + /* syn only 
*/ + if (get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { + opt_rx->smc_ok = 1; + return true; + } } } #endif @@ -4074,7 +4085,7 @@ void tcp_parse_options(const struct net *net, break; } - if (smc_parse_options(th, opt_rx, ptr, opsize)) + if (smc_parse_options(th, opt_rx, ptr, net, opsize)) break; opt_rx->saw_unknown = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 08f059a285f7..13a562ae567f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -416,6 +416,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) #define OPTION_MPTCP (1 << 10) +#define OPTION_SMC_OK BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -427,6 +428,12 @@ static void smc_options_write(__be32 *ptr, u16 *options) (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } else if (OPTION_SMC_OK & *options) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_OK_MAGIC); } } #endif @@ -726,10 +733,15 @@ static void smc_set_option_cond(const struct tcp_sock *tp, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) + const struct sock *sk; + + sk = &tp->inet_conn.icsk_inet.sk; + if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { - opts->options |= OPTION_SMC; + opts->options |= sock_net(sk)->smc.sysctl_smc_experiments ? 
+ OPTION_SMC_OK : OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 170ac42485d2..09c585c69e70 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -118,6 +118,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "sysctl_smc_experiments", + .data = &init_net.smc.sysctl_smc_experiments, + .maxlen = sizeof(init_net.smc.sysctl_smc_experiments), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -152,7 +161,8 @@ int __net_init smc_sysctl_net_init(struct net *net) /* default on */ net->smc.sysctl_simplify_rkey_exhcange = 1; net->smc.sysctl_smc_fastopen = 1; - + /* default off */ + net->smc.sysctl_smc_experiments = 0; return 0; err_reg: -- Gitee From f7bc6b14fe98ff0d30d45ae18dc11ebdbb8221cb Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:04:24 +0800 Subject: [PATCH 78/95] anolis: net/smc: Change listen wq to unbound highpri wq ANBZ: #1742 Change listen wq to unbound and highpri wq. Signed-off-by: D. Wythe Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bc7404a3dee2..30aac93aeb26 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3491,7 +3491,7 @@ static int __init smc_init(void) rc = -ENOMEM; - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!smc_tcp_ls_wq) goto out_pnet; -- Gitee From 156c4aba552bea4067c7c2ea1e57e6815ae44360 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:04:45 +0800 Subject: [PATCH 79/95] anolis: net/smc: remove useless path ANBZ: #1742 Shorten the fallback processing path Signed-off-by: D. 
Wythe Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 30aac93aeb26..06b17d94d3fb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2424,16 +2424,6 @@ static void smc_listen_work(struct work_struct *work) return; } - /* check if peer is smc capable */ - if (!tcp_sk(newclcsock->sk)->syn_smc) { - rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); - if (rc) - smc_listen_out_err(new_smc); - else - smc_listen_out_connected(new_smc); - return; - } - /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ @@ -2539,13 +2529,6 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) { - new_smc->smc_negotiated = 1; - atomic_inc(&lsmc->queued_smc_hs); - /* memory barrier */ - smp_mb__after_atomic(); - } - new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -2554,9 +2537,26 @@ static void smc_tcp_listen_work(struct work_struct *work) smc_copy_sock_settings_to_smc(new_smc); new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; - sock_hold(&new_smc->sk); /* sock_put in passive closing */ - if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) - sock_put(&new_smc->sk); + + /* check if peer is smc capable */ + if (!tcp_sk(new_smc->clcsock->sk)->syn_smc) { + release_sock(lsk); + sock_hold(&new_smc->sk); /* sock_put in passive closing */ + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); + lock_sock(lsk); + } else { + new_smc->smc_negotiated = 1; + atomic_inc(&lsmc->queued_smc_hs); + /* memory barrier */ + smp_mb__after_atomic(); + 
sock_hold(&new_smc->sk); /* sock_put in passive closing */ + if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) + sock_put(&new_smc->sk); + } } out: -- Gitee From 8c2701ab347dde44f87ce7919cd4b8345fe9baaf Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 21 Jul 2022 11:05:04 +0800 Subject: [PATCH 80/95] anolis: net/smc: queue free_work to smc_close_wq instead of smc_hs_wq ANBZ: #1742 Queue free_work to smc_close_wq instead of smc_hs_wq. Signed-off-by: D. Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 06b17d94d3fb..80d804435db6 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -310,7 +310,7 @@ static int __smc_release(struct smc_sock *smc) if (sk->sk_state == SMC_CLOSED) { sock_hold(sk); - if (!queue_work(smc_hs_wq, &smc->free_work)) + if (!queue_work(smc_close_wq, &smc->free_work)) sock_put(sk); } -- Gitee From 4b72d4d4eb32bc7aced17f0a7fcf1c434beb58be Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:06:46 +0800 Subject: [PATCH 81/95] anolis: net/smc: do not use free work if fallback ANBZ: #1742 Do not use free work if fallback to shorten fallback smc_release process. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 80d804435db6..a0a38de34849 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -309,9 +309,17 @@ static int __smc_release(struct smc_sock *smc) sk->sk_prot->unhash(sk); if (sk->sk_state == SMC_CLOSED) { - sock_hold(sk); - if (!queue_work(smc_close_wq, &smc->free_work)) - sock_put(sk); + if (smc->clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } + + if (!smc->use_fallback) { + sock_hold(sk); + if (!queue_work(smc_close_wq, &smc->free_work)) + sock_put(sk); + } } return rc; @@ -382,16 +390,8 @@ static void smc_free_work(struct work_struct *work) sk = &smc->sk; lock_sock(sk); - if (sk->sk_state == SMC_CLOSED) { - if (smc->clcsock) { - release_sock(sk); - smc_clcsock_release(smc); - lock_sock(sk); - } - - if (!smc->use_fallback) - smc_conn_free(&smc->conn); - } + if (sk->sk_state == SMC_CLOSED && !smc->use_fallback) + smc_conn_free(&smc->conn); release_sock(sk); sock_put(sk); /* before queue */ -- Gitee From d950131438d0321f9762b8e44930c3e842f2a138 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:07:31 +0800 Subject: [PATCH 82/95] anolis: net/smc: only add wait queue when smc_accept_dequeue get null ANBZ: #1742 Only add wait queue when smc_accept_dequeue get null. And change wait queue api from add_wait_queue_exclusive to prepare_to_wait_exclusive as inet_csk_wait_for_connect did. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a0a38de34849..c8d0419fb770 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2654,9 +2654,10 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) { struct sock *sk = sock->sk, *nsk; - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT(wait); struct smc_sock *lsmc; long timeo; + bool waited = false; int rc = 0; lsmc = smc_sk(sk); @@ -2669,15 +2670,16 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, goto out; } - /* Wait for an incoming connection */ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { - set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { rc = -EAGAIN; break; } + /* Wait for an incoming connection */ + prepare_to_wait_exclusive(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + waited = true; release_sock(sk); timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ @@ -2688,8 +2690,9 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, break; } } - set_current_state(TASK_RUNNING); - remove_wait_queue(sk_sleep(sk), &wait); + + if (waited) + finish_wait(sk_sleep(sk), &wait); if (!rc) rc = sock_error(nsk); -- Gitee From 037505f7c32599c2a088207178657dfb4504d883 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:07:39 +0800 Subject: [PATCH 83/95] anolis: net/smc: double check whether accept queue is empty before schedule_timeout ANBZ: #1742 Double check whether accept queue is empty before schedule_timeout. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c8d0419fb770..85b5b53041e7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1814,6 +1814,11 @@ static void smc_accept_unlink(struct sock *sk) sock_put(sk); /* sock_hold in smc_accept_enqueue */ } +static inline bool smc_accept_queue_empty(struct sock *sk) +{ + return list_empty(&smc_sk(sk)->accept_q); +} + /* remove a sock from the accept queue to bind it to a new socket created * for a socket accept call from user space */ @@ -2681,7 +2686,8 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, TASK_INTERRUPTIBLE); waited = true; release_sock(sk); - timeo = schedule_timeout(timeo); + if (smc_accept_queue_empty(sk)) + timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ sched_annotate_sleep(); lock_sock(sk); -- Gitee From 16d5a1f7b5511aa2faed624eb209fbb01d1e4aa7 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:01 +0800 Subject: [PATCH 84/95] anolis: net/smc: optimize for smc_accept_poll ANBZ: #1742 There is no need to take accept_q_lock when checking whether accept_q is empty.
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 85b5b53041e7..c12b28f63b1d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2815,17 +2815,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, return rc; } -static __poll_t smc_accept_poll(struct sock *parent) +static inline __poll_t smc_accept_poll(struct sock *parent) { - struct smc_sock *isk = smc_sk(parent); - __poll_t mask = 0; - - spin_lock(&isk->accept_q_lock); - if (!list_empty(&isk->accept_q)) - mask = EPOLLIN | EPOLLRDNORM; - spin_unlock(&isk->accept_q_lock); + if (!smc_accept_queue_empty(parent)) + return EPOLLIN | EPOLLRDNORM; - return mask; + return 0; } static __poll_t smc_poll(struct file *file, struct socket *sock, -- Gitee From fde3898c4a1a463a1867d193253edab549cc1cc8 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 13:02:22 +0800 Subject: [PATCH 85/95] anolis: net/smc: remove sock lock in smc_tcp_listen_work ANBZ: #1742 Since lsk is held and will not be freed, and lsk is used to read some fields in it, there is no need to lock lsk in smc_tcp_listen_work.
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c12b28f63b1d..703d39158653 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1735,13 +1735,11 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc = -EINVAL; - release_sock(lsk); new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; *new_smc = NULL; - lock_sock(lsk); goto out; } *new_smc = smc_sk(new_sk); @@ -1750,7 +1748,6 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (lsmc->clcsock) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); - lock_sock(lsk); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { @@ -2526,7 +2523,6 @@ static void smc_tcp_listen_work(struct work_struct *work) struct smc_sock *new_smc; int rc = 0; - lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); if (rc) /* clcsock accept queue empty or error */ @@ -2545,14 +2541,12 @@ static void smc_tcp_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(new_smc->clcsock->sk)->syn_smc) { - release_sock(lsk); sock_hold(&new_smc->sk); /* sock_put in passive closing */ rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); if (rc) smc_listen_out_err(new_smc); else smc_listen_out_connected(new_smc); - lock_sock(lsk); } else { new_smc->smc_negotiated = 1; atomic_inc(&lsmc->queued_smc_hs); @@ -2565,7 +2559,6 @@ static void smc_tcp_listen_work(struct work_struct *work) } out: - release_sock(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } -- Gitee From c8f3278e6425870716df3f7faf898c826ddabe0f Mon Sep 17 00:00:00 2001 From: 
Guangguan Wang Date: Thu, 21 Jul 2022 13:02:34 +0800 Subject: [PATCH 86/95] anolis: net/smc: move sk_acceptq_{removed,add} into accept_q_lock's protection ANBZ: #1742 Move sk_acceptq_{removed,add} into accept_q_lock's protection. Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 703d39158653..01a2dc1c8ef8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1795,8 +1795,8 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); - spin_unlock(&par->accept_q_lock); sk_acceptq_added(parent); + spin_unlock(&par->accept_q_lock); } /* remove a socket from the accept queue of its parental listening socket */ @@ -1806,8 +1806,8 @@ static void smc_accept_unlink(struct sock *sk) spin_lock(&par->accept_q_lock); list_del_init(&smc_sk(sk)->accept_q); - spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); + spin_unlock(&par->accept_q_lock); sock_put(sk); /* sock_hold in smc_accept_enqueue */ } -- Gitee From 2d8cafdd72bba7b559bf0e3c155372f34e92ad44 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 12:09:16 +0800 Subject: [PATCH 87/95] anolis: net/smc: smc_sock_alloc after kernel_accept ANBZ: #1742 Execution of smc_sock_alloc and free sock is a waste of CPU when kernel_accept fails. As the success probability of smc sock_alloc is higher than that of kernel_accept, it is more reasonable to first kernel_accept and then smc_sock_alloc. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 01a2dc1c8ef8..2e58d7ea9bd2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1735,33 +1735,22 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc = -EINVAL; - new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); - if (!new_sk) { - rc = -ENOMEM; - lsk->sk_err = ENOMEM; - *new_smc = NULL; - goto out; - } - *new_smc = smc_sk(new_sk); - mutex_lock(&lsmc->clcsock_release_lock); if (lsmc->clcsock) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; - if (rc < 0 || lsk->sk_state == SMC_CLOSED) { - new_sk->sk_prot->unhash(new_sk); - mutex_lock(&lsmc->clcsock_release_lock); - if (new_clcsock) - sock_release(new_clcsock); - new_sk->sk_state = SMC_CLOSED; - sock_set_flag(new_sk, SOCK_DEAD); - mutex_unlock(&lsmc->clcsock_release_lock); - sock_put(new_sk); /* final */ - *new_smc = NULL; - goto out; + if (rc < 0 || lsk->sk_state == SMC_CLOSED) + goto err_out; + + new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); + if (!new_sk) { + rc = -ENOMEM; + lsk->sk_err = ENOMEM; + goto err_out; } + *new_smc = smc_sk(new_sk); /* new clcsock has inherited the smc listen-specific sk_data_ready * function; switch it back to the original sk_data_ready function @@ -1781,7 +1770,12 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) } (*new_smc)->clcsock = new_clcsock; -out: + + return 0; +err_out: + *new_smc = NULL; + if (new_clcsock) + sock_release(new_clcsock); return rc; } -- Gitee From 4104d5ebc850b1001a49a05c2530b0bb5f16def3 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:25 
+0800 Subject: [PATCH 88/95] anolis: net/smc: Introduce multiple tcp listen works to enhance tcp_listen_work ANBZ: #1742 Introduce multiple tcp listen works to enhance tcp_listen_work, as each tcp listen work can be enqueued independently to workqueue and can be executed concurrently. Since kernel_accept cannot accept concurrently, too many tcp listen works will only lead to excessive kernel_accept competition and waste CPU, the number of the tcp listen works is now set to 2, which has been tested to be the best performance. Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 16 ++++++++++++---- net/smc/smc.h | 11 ++++++++++- net/smc/smc_close.c | 4 +++- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2e58d7ea9bd2..8a28de50e413 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -403,6 +403,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, struct smc_sock *smc; struct proto *prot; struct sock *sk; + int i = 0; prot = (protocol == SMCPROTO_SMC6) ? 
&smc_proto6 : &smc_proto; sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); @@ -417,7 +418,11 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); smc->keep_clcsock = 0; - INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { + smc->tcp_listen_works[i].smc = smc; + INIT_WORK(&smc->tcp_listen_works[i].work, smc_tcp_listen_work); + } + atomic_set(&smc->tcp_listen_work_seq, 0); INIT_WORK(&smc->free_work, smc_free_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); @@ -2511,8 +2516,9 @@ static void smc_listen_work(struct work_struct *work) static void smc_tcp_listen_work(struct work_struct *work) { - struct smc_sock *lsmc = container_of(work, struct smc_sock, - tcp_listen_work); + struct smc_tcp_listen_work *twork = + container_of(work, struct smc_tcp_listen_work, work); + struct smc_sock *lsmc = twork->smc; struct sock *lsk = &lsmc->sk; struct smc_sock *new_smc; int rc = 0; @@ -2566,8 +2572,10 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { + int idx = atomic_fetch_inc(&lsmc->tcp_listen_work_seq) % + SMC_MAX_TCP_LISTEN_WORKS; sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_works[idx].work)) sock_put(&lsmc->sk); } out: diff --git a/net/smc/smc.h b/net/smc/smc.h index 39b9caa0d783..b910fcfc4622 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -240,6 +240,13 @@ struct smc_connection { u8 out_of_sync : 1; /* out of sync with peer */ }; +#define SMC_MAX_TCP_LISTEN_WORKS 2 + +struct smc_tcp_listen_work { + struct smc_sock *smc; + struct work_struct work; +}; + struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal 
tcp socket */ @@ -255,7 +262,9 @@ struct smc_sock { /* smc sock container */ struct smc_sock *listen_smc; /* listen parent */ bool keep_clcsock; struct work_struct connect_work; /* handle non-blocking connect*/ - struct work_struct tcp_listen_work;/* handle tcp socket accepts */ + struct smc_tcp_listen_work tcp_listen_works[SMC_MAX_TCP_LISTEN_WORKS]; + /* handle tcp socket accepts */ + atomic_t tcp_listen_work_seq;/* used to select tcp_listen_works */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct work_struct free_work; /* free smc conn */ struct list_head accept_q; /* sockets to be accepted */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 038bcafe9a9e..f2ab91faa96d 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -201,6 +201,7 @@ int smc_close_active(struct smc_sock *smc) long timeout; int rc = 0; int rc1 = 0; + int i = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? @@ -225,7 +226,8 @@ int smc_close_active(struct smc_sock *smc) } smc_close_cleanup_listen(sk); release_sock(sk); - flush_work(&smc->tcp_listen_work); + for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) + flush_work(&smc->tcp_listen_works[i].work); lock_sock(sk); break; case SMC_ACTIVE: -- Gitee From a8ffe36bc6ca395e604417eeba7b40ab56662512 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:35 +0800 Subject: [PATCH 89/95] anolis: net/smc: only cancel connect_work when connect nonblock ANBZ: #1742 In smc_release, only cancel connect_work when the connect is non-blocking, as a connect without the NON_BLOCKING flag will not queue connect work.
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8a28de50e413..7c06c90601c2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -343,7 +343,7 @@ static int smc_release(struct socket *sock) if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); - if (cancel_work_sync(&smc->connect_work)) + if (smc->connect_nonblock && cancel_work_sync(&smc->connect_work)) sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) -- Gitee From 10f5ae96b70f65854d1758dafb8b4db6b479a616 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:42 +0800 Subject: [PATCH 90/95] anolis: net/smc: do not call cancel smc_listen_work when smc use fallback ANBZ: #1742 In smc_clcsock_release, do not call cancel smc_listen_work when smc use fallback, as fallback smc will not queue smc_listen_work to workqueue. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_close.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index f2ab91faa96d..c6b550930182 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -25,7 +25,8 @@ void smc_clcsock_release(struct smc_sock *smc) { struct socket *tcp; - if (smc->listen_smc && current_work() != &smc->smc_listen_work) + if (smc->listen_smc && !smc->use_fallback && + current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); /* don't release clcsock for eRDMA */ -- Gitee From eb3512fedfc0e9e6b2dfe8b66de45a7676359c0c Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:49 +0800 Subject: [PATCH 91/95] anolis: net/smc: check sk_ack_backlog before kernel_accept ANBZ: #1742 If the sock accept queue is empty, kernel_accept will call sock_create_lite and then almost immediately sock_release, which is a waste of CPU. As sk_ack_backlog indicates the accept queue's length, check the depth of the accept queue through sk_ack_backlog to decide whether to call kernel_accept, which avoids the aforementioned waste of CPU.
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 7c06c90601c2..afa93c3b7ff5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1741,8 +1741,12 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) int rc = -EINVAL; mutex_lock(&lsmc->clcsock_release_lock); - if (lsmc->clcsock) - rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); + if (lsmc->clcsock) { + if (lsmc->clcsock->sk->sk_ack_backlog) + rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); + else + rc = -EAGAIN; + } mutex_unlock(&lsmc->clcsock_release_lock); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; -- Gitee From 8d40e6e59fa4564f9e25057dba4cecd134c4d7ca Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:56 +0800 Subject: [PATCH 92/95] anolis: net/smc: change clcsock_release_lock from mutex to rw_semaphore ANBZ: #1742 The lock named clcsock_release_lock is used to protect smc->clcsock from being released while it is in use. Since smc->clcsock is only released in smc_release, and most of the accesses to smc->clcsock are readers that only need it to stay valid, an rw_semaphore is a better fit for this lock than a mutex.
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 26 ++++++++++++++------------ net/smc/smc.h | 2 +- net/smc/smc_clc.c | 6 +++--- net/smc/smc_close.c | 4 ++-- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index afa93c3b7ff5..e7800d75c41f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -431,7 +431,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); - mutex_init(&smc->clcsock_release_lock); + init_rwsem(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); /* default behavior from every net namespace */ @@ -924,7 +924,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { int rc = 0; - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { rc = -EBADF; goto out; @@ -947,7 +947,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc_fback_replace_callbacks(smc); } out: - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return rc; } @@ -1740,14 +1740,14 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc = -EINVAL; - mutex_lock(&lsmc->clcsock_release_lock); + down_read(&lsmc->clcsock_release_lock); if (lsmc->clcsock) { if (lsmc->clcsock->sk->sk_ack_backlog) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); else rc = -EAGAIN; } - mutex_unlock(&lsmc->clcsock_release_lock); + up_read(&lsmc->clcsock_release_lock); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) @@ -1834,10 +1834,12 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); + down_write(&isk->clcsock_release_lock); if 
(isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } + up_write(&isk->clcsock_release_lock); sock_put(new_sk); /* final */ continue; } @@ -3032,9 +3034,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EBADF; } if (unlikely(!smc->clcsock->ops->setsockopt)) @@ -3046,7 +3048,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -3112,19 +3114,19 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, return __smc_getsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sock->sk); - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EBADF; } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EOPNOTSUPP; } rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return rc; } diff --git a/net/smc/smc.h b/net/smc/smc.h index b910fcfc4622..7e946c9e3099 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -298,7 +298,7 @@ struct smc_sock { /* smc sock container */ /* non-blocking connect in * flight */ - struct mutex clcsock_release_lock; + struct rw_semaphore clcsock_release_lock; /* protects clcsock of a listen * socket * */ diff --git a/net/smc/smc_clc.c 
b/net/smc/smc_clc.c index 9a75119b3437..365831c683f1 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -795,13 +795,13 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = send_len; - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock || !smc->clcsock->sk) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EPROTO; } len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 0 : len; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index c6b550930182..74321f6b2230 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -28,7 +28,7 @@ void smc_clcsock_release(struct smc_sock *smc) if (smc->listen_smc && !smc->use_fallback && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); - mutex_lock(&smc->clcsock_release_lock); + down_write(&smc->clcsock_release_lock); /* don't release clcsock for eRDMA */ if (smc->clcsock) { tcp = smc->clcsock; @@ -36,7 +36,7 @@ void smc_clcsock_release(struct smc_sock *smc) if (!smc->keep_clcsock) sock_release(tcp); } - mutex_unlock(&smc->clcsock_release_lock); + up_write(&smc->clcsock_release_lock); } static void smc_close_cleanup_listen(struct sock *parent) -- Gitee From 4297dd011d1470f13964375984e160fba94f2aaf Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 18 Jul 2022 19:58:40 +0800 Subject: [PATCH 93/95] anolis: net/smc: remove locks smc_client_lgr_pending and smc_server_lgr_pending ANBZ: #1742 This patch attempts to remove locks named smc_client_lgr_pending and smc_server_lgr_pending, which aim to serialize the creation of link group. 
However, once a link group already exists, those locks are meaningless; worse still, they force incoming connections to be queued one after the other. Now, the creation of a link group is no longer decided by competition, but allocated through the following strategy. 1. Try to find a suitable link group; if successful, the current connection is considered a non-first-contact connection. Ends. 2. Check the number of connections currently waiting for a suitable link group to be created; if it is not less than the number of link groups to be created multiplied by SMC_RMBS_PER_LGR_MAX, goto 5. 3. Increase the number of connections currently waiting, and wait to be woken up. 4. goto 1 5. Increase the number of link groups to be created; the current connection is considered the first contact connection. Ends. We wake up the connections that were put to sleep in the above strategy through the SMC link state change event. Once the link moves out of the SMC_LNK_ACTIVATING state, decrease the number of link groups to be created, and then wake up at most SMC_RMBS_PER_LGR_MAX connections. A potential optimization is that when a link transitions from the SMC_LNK_ACTIVATING state to a state other than SMC_LNK_ACTIVE, only one connection is woken up and forced to be the first contact. This is already on the TODO list. Signed-off-by: D.
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 11 +- net/smc/smc_core.c | 283 ++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_core.h | 44 +++++++ net/smc/smc_llc.c | 8 +- 4 files changed, 334 insertions(+), 12 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e7800d75c41f..3dc8fcaa77a9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1262,10 +1262,8 @@ static int smc_connect_rdma(struct smc_sock *smc, if (reason_code) return reason_code; - mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); if (reason_code) { - mutex_unlock(&smc_client_lgr_pending); return reason_code; } @@ -1362,7 +1360,6 @@ static int smc_connect_rdma(struct smc_sock *smc, if (reason_code) goto connect_abort; } - mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; @@ -1372,7 +1369,6 @@ static int smc_connect_rdma(struct smc_sock *smc, return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); - mutex_unlock(&smc_client_lgr_pending); smc->connect_nonblock = 0; return reason_code; @@ -2465,7 +2461,8 @@ static void smc_listen_work(struct work_struct *work) if (rc) goto out_decl; - mutex_lock(&smc_server_lgr_pending); + if (ini->is_smcd) + mutex_lock(&smc_server_lgr_pending); smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); @@ -2503,7 +2500,6 @@ static void smc_listen_work(struct work_struct *work) ini->first_contact_local, ini); if (rc) goto out_unlock; - mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); @@ -2511,7 +2507,8 @@ static void smc_listen_work(struct work_struct *work) goto out_free; out_unlock: - mutex_unlock(&smc_server_lgr_pending); + if (ini->is_smcd) + mutex_unlock(&smc_server_lgr_pending); out_decl: smc_listen_decline(new_smc, rc, ini ? 
ini->first_contact_local : 0, proposal_version); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index edd86332a78b..6ca90dade3eb 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -46,6 +46,10 @@ struct smc_lgr_list smc_lgr_list = { /* established link groups */ .num = 0, }; +struct smc_lgr_manager smc_lgr_manager = { + .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_manager.lock), +}; + static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); @@ -55,6 +59,227 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); static void smc_link_down_work(struct work_struct *work); +/* SMC-R lgr cluster compare func */ +static int smcr_lnk_cluster_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct smc_lnk_cluster_compare_arg *key = arg->key; + const struct smc_lnk_cluster *lnkc = obj; + + if (memcmp(key->peer_systemid, lnkc->peer_systemid, SMC_SYSTEMID_LEN)) + return 1; + + if (memcmp(key->peer_gid, lnkc->peer_gid, SMC_GID_SIZE)) + return 1; + + if (key->smcr_version != SMC_V2 && memcmp(key->peer_mac, lnkc->peer_mac, ETH_ALEN)) + return 1; + + return 0; +} + +/* SMC-R lgr cluster hash func */ +static u32 smcr_lnk_cluster_hashfn(const void *data, u32 len, u32 seed) +{ + const struct smc_lnk_cluster *lnkc = data; + + return jhash2((u32 *)lnkc->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed); +} + +/* SMC-R lgr cluster compare arg hash func */ +static u32 smcr_lnk_cluster_compare_arg_hashfn(const void *data, u32 len, u32 seed) +{ + const struct smc_lnk_cluster_compare_arg *key = data; + + return jhash2((u32 *)key->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed); +} + +static const struct rhashtable_params smcr_lnk_cluster_rhl_params = { + .head_offset = offsetof(struct smc_lnk_cluster, rnode), + .key_len = sizeof(struct smc_lnk_cluster_compare_arg), + .obj_cmpfn = smcr_lnk_cluster_cmpfn, + .obj_hashfn = smcr_lnk_cluster_hashfn, + .hashfn = 
smcr_lnk_cluster_compare_arg_hashfn, + .automatic_shrinking = true, +}; + +/* hold a reference for smc_lnk_cluster */ +static inline struct smc_lnk_cluster *smc_lnk_cluster_hold(struct smc_lnk_cluster *lnkc) +{ + if (lnkc) + refcount_inc(&lnkc->ref); + return lnkc; +} + +/* release a reference for smc_lnk_cluster */ +static inline void smc_lnk_cluster_put(struct smc_lnk_cluster *lnkc) +{ + bool do_free = false; + + if (!lnkc) + return; + + if (refcount_dec_and_lock(&lnkc->ref, &smc_lgr_manager.lock)) { + do_free = true; + rhashtable_remove_fast(&smc_lgr_manager.lnk_cluster_maps, &lnkc->rnode, + smcr_lnk_cluster_rhl_params); + spin_unlock(&smc_lgr_manager.lock); + } + if (do_free) + kfree(lnkc); +} + +/* Get or create smc_lnk_cluster by key + * This function will hold a reference of returned smc_lnk_cluster + * or set refcount to one if have to create. + * caller MUST call smc_lnk_cluster_put after this. + */ +static inline struct smc_lnk_cluster * +smcr_lnk_get_or_create_cluster(struct smc_lnk_cluster_compare_arg *key) +{ + struct smc_lnk_cluster *lnkc; + int err; + + spin_lock(&smc_lgr_manager.lock); + lnkc = rhashtable_lookup_fast(&smc_lgr_manager.lnk_cluster_maps, key, + smcr_lnk_cluster_rhl_params); + if (!lnkc) { + lnkc = kzalloc(sizeof(*lnkc), GFP_ATOMIC); + if (unlikely(!lnkc)) + goto fail; /* decline */ + + /* init cluster */ + spin_lock_init(&lnkc->lock); + init_waitqueue_head(&lnkc->first_contact_waitqueue); + memcpy(lnkc->peer_systemid, key->peer_systemid, SMC_SYSTEMID_LEN); + memcpy(lnkc->peer_gid, key->peer_gid, SMC_GID_SIZE); + memcpy(lnkc->peer_mac, key->peer_mac, ETH_ALEN); + refcount_set(&lnkc->ref, 1); + + err = rhashtable_insert_fast(&smc_lgr_manager.lnk_cluster_maps, &lnkc->rnode, + smcr_lnk_cluster_rhl_params); + if (unlikely(err)) { + pr_warn_ratelimited("rhashtable_insert_fast failed"); + kfree(lnkc); + lnkc = NULL; + } + } else { + lnkc = smc_lnk_cluster_hold(lnkc); + } +fail: + spin_unlock(&smc_lgr_manager.lock); + return lnkc; +} + +/* 
caller MUST call smc_lnk_cluster_put after this. + */ +static inline struct smc_lnk_cluster *smcr_lnk_get_cluster(struct smc_link *lnk) +{ + struct smc_lnk_cluster_compare_arg key; + struct smc_link_group *lgr; + + lgr = lnk->lgr; + if (!lgr || lgr->is_smcd || lgr->role != SMC_SERV) + return NULL; + + key.smcr_version = lgr->smc_version; + key.peer_systemid = lgr->peer_systemid; + key.peer_gid = lnk->peer_gid; + key.peer_mac = lnk->peer_mac; + + return smcr_lnk_get_or_create_cluster(&key); +} + +/* caller MUST call smc_lnk_cluster_put after this. + */ +static inline struct smc_lnk_cluster * +smcr_lnk_get_cluster_by_ini(struct smc_init_info *ini, int role) +{ + struct smc_lnk_cluster_compare_arg key; + + if (ini->is_smcd || role != SMC_SERV) + return NULL; + + key.smcr_version = ini->smcr_version; + key.peer_systemid = ini->peer_systemid; + key.peer_gid = ini->peer_gid; + key.peer_mac = ini->peer_mac; + + return smcr_lnk_get_or_create_cluster(&key); +} + +/* callback when smc link state change */ +void smcr_lnk_cluster_on_lnk_state(struct smc_link *lnk, struct smc_init_info *ini) +{ + struct smc_lnk_cluster *lnkc; + int nr = 0; + + /* barrier for lnk->state */ + smp_wmb(); + + /* only first link & server can made connections block on + * first_contact_waitqueue + */ + if (lnk->link_idx != SMC_SINGLE_LINK || lnk->lgr->role != SMC_SERV) + return; + + /* state already seen */ + if (lnk->state_record & SMC_LNK_STATE_BIT(lnk->state)) + return; + + /* before smc_link_save_peer_info, we can not find lnkc + * by lnk + */ + lnkc = ini ? smcr_lnk_get_cluster_by_ini(ini, SMC_SERV) : + smcr_lnk_get_cluster(lnk); + + if (unlikely(!lnkc)) + return; + + spin_lock(&lnkc->lock); + + /* all lnk state change should be + * 1. SMC_LNK_UNUSED -> SMC_LNK_TEAR_DWON (link init failed) + * 2. SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_TEAR_DWON + * 3. SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_INACTIVE -> SMC_LNK_TEAR_DWON + * 4. 
SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_INACTIVE -> SMC_LNK_TEAR_DWON + * 5. SMC_LNK_UNUSED -> SMC_LNK_ATIVATING -> SMC_LNK_ACTIVE ->SMC_LNK_INACTIVE + * -> SMC_LNK_TEAR_DWON + */ + switch (lnk->state) { + case SMC_LNK_ACTIVATING: + /* It's safe to hold a reference without lock + * dues to the smcr_lnk_get_cluster already hold one + */ + smc_lnk_cluster_hold(lnkc); + break; + case SMC_LNK_TEAR_DWON: + if (lnk->state_record & SMC_LNK_STATE_BIT(SMC_LNK_ACTIVATING)) + /* smc_lnk_cluster_hold in SMC_LNK_ACTIVATING */ + smc_lnk_cluster_put(lnkc); + fallthrough; + case SMC_LNK_ACTIVE: + case SMC_LNK_INACTIVE: + if (!(lnk->state_record & + (SMC_LNK_STATE_BIT(SMC_LNK_ACTIVE) + | SMC_LNK_STATE_BIT(SMC_LNK_INACTIVE)))) { + lnkc->pending_capability -= (SMC_RMBS_PER_LGR_MAX - 1); + /* TODO: wakeup just one to perfrom first contact + * if record state has no SMC_LNK_ACTIVE + */ + nr = SMC_RMBS_PER_LGR_MAX - 1; + } + break; + case SMC_LNK_UNUSED: + pr_warn_ratelimited("smc: invalid lnk state. 
"); + break; + } + SMC_LNK_STATE_RECORD(lnk, lnk->state); + spin_unlock(&lnkc->lock); + if (nr) + wake_up_nr(&lnkc->first_contact_waitqueue, nr); + smc_lnk_cluster_put(lnkc); /* smc_lnk_cluster_hold in smcr_lnk_get_cluster */ +} + /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) @@ -648,8 +873,10 @@ static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (smc_link_sendable(lnk)) + if (smc_link_sendable(lnk)) { lnk->state = SMC_LNK_INACTIVE; + smcr_lnk_cluster_on_lnk_state(lnk, NULL); + } } wake_up_all(&lgr->llc_msg_waiter); wake_up_all(&lgr->llc_flow_waiter); @@ -806,6 +1033,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, if (rc) goto destroy_qp; lnk->state = SMC_LNK_ACTIVATING; + smcr_lnk_cluster_on_lnk_state(lnk, ini); return 0; destroy_qp: @@ -820,6 +1048,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; + lnk->state = SMC_LNK_TEAR_DWON; + smcr_lnk_cluster_on_lnk_state(lnk, ini); memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) @@ -1269,6 +1499,8 @@ static void __smcr_link_clear(struct smc_link *lnk) sock_release(lnk->clcsock); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; + lnk->state = SMC_LNK_TEAR_DWON; + smcr_lnk_cluster_on_lnk_state(lnk, NULL); memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) @@ -1741,6 +1973,7 @@ void smcr_link_down_cond(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); + smcr_lnk_cluster_on_lnk_state(lnk, NULL); smcr_link_down(lnk); } } @@ -1750,6 +1983,7 @@ void 
smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); + smcr_lnk_cluster_on_lnk_state(lnk, NULL); schedule_work(&lnk->link_down_wrk); } } @@ -1893,11 +2127,13 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; struct net *net = sock_net(&smc->sk); + DECLARE_WAITQUEUE(wait, current); + struct smc_lnk_cluster *lnkc = NULL; struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; spinlock_t *lgr_lock; - int rc = 0; + int rc = 0, timeo = CLC_WAIT_TIME; lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list : &smc_lgr_list.list; @@ -1905,12 +2141,20 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) &smc_lgr_list.lock; ini->first_contact_local = 1; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; + + if (!ini->is_smcd && role == SMC_SERV) { + lnkc = smcr_lnk_get_cluster_by_ini(ini, role); + if (unlikely(!lnkc)) + return SMC_CLC_DECL_INTERR; + } + if (role == SMC_CLNT && ini->first_contact_peer) /* create new link group as well */ goto create; /* determine if an existing link group can be reused */ spin_lock_bh(lgr_lock); +again: list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? 
@@ -1937,9 +2181,35 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } write_unlock_bh(&lgr->conns_lock); } + if (lnkc && ini->first_contact_local) { + spin_lock(&lnkc->lock); + if (lnkc->pending_capability > lnkc->conns_pending) { + lnkc->conns_pending++; + spin_unlock(&lnkc->lock); + spin_unlock_bh(lgr_lock); + + add_wait_queue(&lnkc->first_contact_waitqueue, &wait); + set_current_state(TASK_INTERRUPTIBLE); + /* need to wait at least once first contact done */ + timeo = schedule_timeout(timeo); + set_current_state(TASK_RUNNING); + remove_wait_queue(&lnkc->first_contact_waitqueue, &wait); + spin_lock_bh(lgr_lock); + spin_lock(&lnkc->lock); + + lnkc->conns_pending--; + if (timeo) { + spin_unlock(&lnkc->lock); + goto again; + } + } + /* first_contact */ + lnkc->pending_capability += (SMC_RMBS_PER_LGR_MAX - 1); + spin_unlock(&lnkc->lock); + } spin_unlock_bh(lgr_lock); if (rc) - return rc; + goto out; if (role == SMC_CLNT && !ini->first_contact_peer && ini->first_contact_local) { @@ -1947,7 +2217,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) * a new one * send out_of_sync decline, reason synchr. 
error */ - return SMC_CLC_DECL_SYNCERR; + rc = SMC_CLC_DECL_SYNCERR; + goto out; } create: @@ -1988,6 +2259,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) #endif out: + /* smc_lnk_cluster_hold in smcr_lnk_get_or_create_cluster */ + smc_lnk_cluster_put(lnkc); return rc; } @@ -2646,6 +2919,8 @@ static struct notifier_block smc_reboot_notifier = { int __init smc_core_init(void) { + /* init smc lgr manager */ + rhashtable_init(&smc_lgr_manager.lnk_cluster_maps, &smcr_lnk_cluster_rhl_params); return register_reboot_notifier(&smc_reboot_notifier); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e1f613ac2f15..6b1f4acac47f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -34,6 +35,40 @@ struct smc_lgr_list { /* list of link group definition */ u32 num; /* unique link group number */ }; +struct smc_lgr_manager { /* manager for link group */ + struct rhashtable lnk_cluster_maps; /* maps of smc_lnk_cluster */ + spinlock_t lock; /* lock for lgr_cm_maps */ +}; + +struct smc_lnk_cluster { + struct rhash_head rnode; /* node for rhashtable */ + struct wait_queue_head first_contact_waitqueue; + /* queue for non first contact to wait + * first contact to be established. + */ + spinlock_t lock; /* protection for link group */ + refcount_t ref; /* refcount for cluster */ + unsigned long pending_capability; + /* maximum pending number of connections that + * need wait first contact complete. 
+ */ + unsigned long conns_pending; + /* connections that are waiting for first contact + * complete + */ + u8 peer_systemid[SMC_SYSTEMID_LEN]; + u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ + u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ +}; + +struct smc_lnk_cluster_compare_arg /* key for smc_lnk_cluster */ +{ + int smcr_version; + u8 *peer_systemid; + u8 *peer_gid; + u8 *peer_mac; +}; + enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ @@ -44,8 +79,14 @@ enum smc_link_state { /* possible states of a link */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ + SMC_LNK_TEAR_DWON, /* link is tear down */ }; +#define SMC_LNK_STATE_BIT(state) (1 << (state)) + +#define SMC_LNK_STATE_RECORD(lnk, state) \ + ((lnk)->state_record |= SMC_LNK_STATE_BIT(state)) + #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ #define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ @@ -163,6 +204,7 @@ struct smc_link { int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ + int state_record; /* record of previous state */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ @@ -577,6 +619,8 @@ int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); +void smcr_lnk_cluster_on_lnk_state(struct smc_link *lnk, struct smc_init_info *ini); + static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a0149951d774..5a23123275ba 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1340,6 +1340,7 @@ static void smc_llc_delete_asym_link(struct 
smc_link_group *lgr) return; /* no asymmetric link */ if (!smc_link_downing(&lnk_asym->state)) return; + smcr_lnk_cluster_on_lnk_state(lnk_asym, NULL); lnk_new = smc_switch_conns(lgr, lnk_asym, false); smc_wr_tx_wait_no_pending_sends(lnk_asym); if (!lnk_new) @@ -1559,6 +1560,7 @@ int smc_llc_srv_add_link(struct smc_link *link, out_err: if (link_new) { link_new->state = SMC_LNK_INACTIVE; + smcr_lnk_cluster_on_lnk_state(link_new, NULL); smcr_link_clear(link_new, false); } out: @@ -1669,8 +1671,10 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) del_llc->reason = 0; smc_llc_send_message(lnk, &qentry->msg); /* response */ - if (smc_link_downing(&lnk_del->state)) + if (smc_link_downing(&lnk_del->state)) { + smcr_lnk_cluster_on_lnk_state(lnk, NULL); smc_switch_conns(lgr, lnk_del, false); + } smcr_link_clear(lnk_del, true); active_links = smc_llc_active_link_count(lgr); @@ -1743,6 +1747,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) goto out; /* asymmetric link already deleted */ if (smc_link_downing(&lnk_del->state)) { + smcr_lnk_cluster_on_lnk_state(lnk, NULL); if (smc_switch_conns(lgr, lnk_del, false)) smc_wr_tx_wait_no_pending_sends(lnk_del); } @@ -2262,6 +2267,7 @@ void smc_llc_link_active(struct smc_link *link) schedule_delayed_work(&link->llc_testlink_wrk, link->llc_testlink_time); } + smcr_lnk_cluster_on_lnk_state(link, NULL); } /* called in worker context */ -- Gitee From 325b1bb416c66cbf7923ae3ab9e472ee49844a3f Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 21 Jul 2022 13:46:54 +0800 Subject: [PATCH 94/95] anolis: net/smc: fix SMC_CLC_DECL_ERR_REGRMB without smc_server_lgr_pending ANBZ: #1742 As commit "net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error cause by server" mentioned, it works only when all connection creations are completely protected by smc_server_lgr_pending lock, since we already cancel the lock, we need to re-fix the issues. Signed-off-by: D. 
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 ++ net/smc/smc_core.c | 11 ++++++++--- net/smc/smc_core.h | 21 +++++++++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3dc8fcaa77a9..f7554e92a09e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2501,6 +2501,7 @@ static void smc_listen_work(struct work_struct *work) if (rc) goto out_unlock; } + smc_conn_leave_rtoken_pending(new_smc, ini); smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); @@ -2510,6 +2511,7 @@ static void smc_listen_work(struct work_struct *work) if (ini->is_smcd) mutex_unlock(&smc_server_lgr_pending); out_decl: + smc_conn_leave_rtoken_pending(new_smc, ini); smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, proposal_version); out_free: diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 6ca90dade3eb..6d7ab53ead5e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2169,14 +2169,19 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && - !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { + (SMC_RMBS_PER_LGR_MAX - + bitmap_weight(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) + > atomic_read(&lgr->rtoken_pendings))))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); - if (!rc && delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + if (!rc) { + smc_conn_enter_rtoken_pending(smc, ini); + if (delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); + } break; } write_unlock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6b1f4acac47f..f54dd28e0342 
100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -309,6 +309,9 @@ struct smc_link_group { struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ + atomic_t rtoken_pendings;/* number of connection that + * lgr assigned but no rtoken got yet + */ unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ @@ -619,6 +622,24 @@ int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); +static inline void smc_conn_enter_rtoken_pending(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_link_group *lgr; + + lgr = smc->conn.lgr; + if (lgr && !ini->first_contact_local) + atomic_inc(&lgr->rtoken_pendings); +} + +static inline void smc_conn_leave_rtoken_pending(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_link_group *lgr; + + lgr = smc->conn.lgr; + if (lgr && !ini->first_contact_local) + atomic_dec(&lgr->rtoken_pendings); +} + void smcr_lnk_cluster_on_lnk_state(struct smc_link *lnk, struct smc_init_info *ini); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) -- Gitee From c34665d74f0be8ef8a01ba2ffe146c7716e2c70f Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Fri, 22 Jul 2022 14:05:14 +0800 Subject: [PATCH 95/95] anolis: net/smc: skip smc_llc_flow_initiate while SMC-1RTT ANBZ: #1742 Under SMC-R 1RTT, the confirm_rkey phase is skipped, so it is not necessary to start an LLC flow with smc_llc_flow_initiate(). Signed-off-by: D. 
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f7554e92a09e..9838ad187b2c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -565,9 +565,11 @@ static int smcr_lgr_reg_rmbs(struct smc_sock *smc, struct smc_link_group *lgr = link->lgr; int i, lnk = 0, rc = 0; - rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); - if (rc) - return rc; + if (!smc->simplify_rkey_exhcange) { + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + } /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ @@ -595,7 +597,8 @@ static int smcr_lgr_reg_rmbs(struct smc_sock *smc, rmb_desc->is_conf_rkey = true; out: mutex_unlock(&lgr->llc_conf_mutex); - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + if (!smc->simplify_rkey_exhcange) + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } -- Gitee