From 30a916d39487a68489196639ab608a27b139c0c7 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Fri, 26 May 2023 17:27:46 +0800 Subject: [PATCH 1/9] net/smc: Send directly when TCP_CORK is cleared mainline inclusion from mainline-v5.18-rc1 commit ea785a1a573b390a150010b3c5b81e1ccd8c98a8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=ea785a1a573b390a150010b3c5b81e1ccd8c98a8 -------------------------------- According to the man page of TCP_CORK [1], if set, don't send out partial frames. All queued partial frames are sent when option is cleared again. When applications call setsockopt to disable TCP_CORK, this call is protected by lock_sock(), and tries to mod_delayed_work() to 0, in order to send pending data right now. However, the delayed work smc_tx_work is also protected by lock_sock(). There introduces lock contention for sending data. To fix it, send pending data directly which acts like TCP, without lock_sock() protected in the context of setsockopt (already lock_sock()ed), and cancel unnecessary dealyed work, which is protected by lock. [1] https://linux.die.net/man/7/tcp Signed-off-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 7 ++++--- net/smc/smc_tx.c | 25 +++++++++++++++---------- net/smc/smc_tx.h | 1 + 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 41cbc7c89c9d..810eac6b8ec2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2236,9 +2236,10 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (!val) - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); + if (!val) { + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); + } } break; case TCP_DEFER_ACCEPT: diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 52ef1fca0b60..875e49cb6828 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -577,27 +577,32 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -/* Wakeup sndbuf consumers from process context - * since there is more data to transmit - */ -void smc_tx_work(struct work_struct *work) +void smc_tx_pending(struct smc_connection *conn) { - struct smc_connection *conn = container_of(to_delayed_work(work), - struct smc_connection, - tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc; - lock_sock(&smc->sk); if (smc->sk.sk_err) - goto out; + return; rc = smc_tx_sndbuf_nonempty(conn); if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; +} + +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit + */ +void smc_tx_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(to_delayed_work(work), + struct smc_connection, + tx_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); -out: + lock_sock(&smc->sk); + smc_tx_pending(conn); release_sock(&smc->sk); } diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 07e6ad76224a..a59f370b8b43 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -27,6 +27,7 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } +void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); -- Gitee From 90a5df7aff3753e2b14656ab934a7c68f6958eba Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Fri, 26 May 2023 17:57:18 +0800 Subject: [PATCH 2/9] net/smc: Remove corked dealyed work mainline inclusion from mainline-v5.18-rc1 commit 139653bc6635bcf0923a1d4fa06d3ac594528dd9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=139653bc6635bcf0923a1d4fa06d3ac594528dd9 -------------------------------- Based on the manual of TCP_CORK [1] and MSG_MORE [2], these two options have the same effect. Applications can set these options and informs the kernel to pend the data, and send them out only when the socket or syscall does not specify this flag. In other words, there's no need to send data out by a delayed work, which will queue a lot of work. This removes corked delayed work with SMC_TX_CORK_DELAY (250ms), and the applications control how/when to send them out. It improves the performance for sendfile and throughput, and remove unnecessary race of lock_sock(). This also unlocks the limitation of sndbuf, and try to fill it up before sending. [1] https://linux.die.net/man/7/tcp [2] https://man7.org/linux/man-pages/man2/send.2.html Signed-off-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/smc_tx.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 875e49cb6828..e00975b4b9e0 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -29,7 +29,6 @@ #include "smc_tx.h" #define SMC_TX_WORK_DELAY 0 -#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -223,15 +222,13 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && - (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_desc->len >> 1))) - /* for a corked socket defer the RDMA writes if there - * is still sufficient sndbuf_space available + (atomic_read(&conn->sndbuf_space))) + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. */ - queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, - SMC_TX_CORK_DELAY); - else - smc_tx_sndbuf_nonempty(conn); + continue; + smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ return send_done; -- Gitee From c0be73a2b457f17a17bde13091b1895cb8c606f8 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 29 May 2023 09:42:34 +0800 Subject: [PATCH 3/9] net/smc: Cork when sendpage with MSG_SENDPAGE_NOTLAST flag mainline inclusion from mainline-v5.18-rc1 commit be9a16cccaefac23cb16909e04bb65e62e09d515 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=be9a16cccaefac23cb16909e04bb65e62e09d515 -------------------------------- This introduces a new corked flag, MSG_SENDPAGE_NOTLAST, which is involved in syscall sendfile() [1], it indicates this is not the last page. So we can cork the data until the page is not specify this flag. It has the same effect as MSG_MORE, but existed in sendfile() only. This patch adds a option MSG_SENDPAGE_NOTLAST for corking data, try to cork more data before sending when using sendfile(), which acts like TCP's behaviour. Also, this reimplements the default sendpage to inform that it is supported to some extent. [1] https://man7.org/linux/man-pages/man2/sendfile.2.html Signed-off-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 7 +++++-- net/smc/smc_tx.c | 19 ++++++++++++++++++- net/smc/smc_tx.h | 2 ++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 810eac6b8ec2..25d9580d583a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2365,8 +2365,11 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, if (smc->use_fallback) rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); - else - rc = sock_no_sendpage(sock, page, offset, size, flags); + else { + lock_sock(sk); + rc = smc_tx_sendpage(smc, page, offset, size, flags); + release_sock(sk); + } out: return rc; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index e00975b4b9e0..123cb0d332e9 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -221,7 +221,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && (atomic_read(&conn->sndbuf_space))) /* for a corked socket defer the RDMA writes if * sndbuf_space is still available. The applications @@ -241,6 +242,22 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return rc; } +int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, + size_t size, int flags) +{ + struct msghdr msg = {.msg_flags = flags}; + char *kaddr = kmap(page); + struct kvec iov; + int rc; + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size); + rc = smc_tx_sendmsg(smc, &msg, size); + kunmap(page); + return rc; +} + /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index a59f370b8b43..34b578498b1f 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -31,6 +31,8 @@ void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); +int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, + size_t size, int flags); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); -- Gitee From 8d1fa0a35f56bea06d4182fa907e457b016ba8e2 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 29 May 2023 09:48:27 +0800 Subject: [PATCH 4/9] net/smc: Add comment for smc_tx_pending mainline inclusion from mainline-v5.18-rc1 commit 2e13bde1315373cf7e9fb7dc4330d0d2a6bd5417 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=2e13bde1315373cf7e9fb7dc4330d0d2a6bd5417 -------------------------------- The previous patch introduces a lock-free version of smc_tx_work() to solve unnecessary lock contention, which is expected to be held lock. So this adds comment to remind people to keep an eye out for locks. Suggested-by: Stefan Raspl Signed-off-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/smc_tx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 123cb0d332e9..430cfc47df5d 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -591,6 +591,10 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit. The caller + * must hold sock lock. + */ void smc_tx_pending(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -606,7 +610,8 @@ void smc_tx_pending(struct smc_connection *conn) } /* Wakeup sndbuf consumers from process context - * since there is more data to transmit + * since there is more data to transmit in locked + * sock. */ void smc_tx_work(struct work_struct *work) { -- Gitee From 20259d306b3601cc05d97df0d3a5c7ea3d3d00e6 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 29 May 2023 10:24:31 +0800 Subject: [PATCH 5/9] net/smc: Call trace_smc_tx_sendmsg when data corked mainline inclusion from mainline-v5.18-rc1 commit 6900de507cd471492d83aabd48f13abb9c016d4d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=6900de507cd471492d83aabd48f13abb9c016d4d -------------------------------- This also calls trace_smc_tx_sendmsg() even if data is corked. For ease of understanding, if statements are not expanded here. Link: https://lore.kernel.org/all/f4166712-9a1e-51a0-409d-b7df25a66c52@linux.ibm.com/ Fixes: 139653bc6635 ("net/smc: Remove corked dealyed work") Suggested-by: Stefan Raspl Signed-off-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/smc_tx.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 430cfc47df5d..4379e0981bd5 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -221,15 +221,14 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && - (atomic_read(&conn->sndbuf_space))) - /* for a corked socket defer the RDMA writes if - * sndbuf_space is still available. The applications - * should known how/when to uncork it. - */ - continue; - smc_tx_sndbuf_nonempty(conn); + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. + */ + if (!((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + atomic_read(&conn->sndbuf_space))) + smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ return send_done; -- Gitee From f80a4ca3ae975df6f24653aa6810f2c877d81e0f Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 29 May 2023 14:28:33 +0800 Subject: [PATCH 6/9] net/smc: add autocorking support mainline inclusion from mainline-v5.18-rc1 commit dcd2cf5f2fc0d4d37aa5400b308d401a150c38b6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=dcd2cf5f2fc0d4d37aa5400b308d401a150c38b6 -------------------------------- This patch adds autocorking support for SMC which could improve throughput for small message by x3+. The main idea is borrowed from TCP autocorking with some RDMA specific modification: 1. The first message should never cork to make sure we won't bring extra latency 2. If we have posted any Tx WRs to the NIC that have not completed, cork the new messages until: a) Receive CQE for the last Tx WR b) We have corked enough message on the connection 3. Try to push the corked data out when we receive CQE of the last Tx WR to prevent the corked messages hang in the send queue. Both SMC autocorking and TCP autocorking check the TX completion to decide whether we should cork or not. The difference is when we got a SMC Tx WR completion, the data have been confirmed by the RNIC while TCP TX completion just tells us the data have been sent out by the local NIC. Add an atomic variable tx_pushing in smc_connection to make sure only one can send to let it cork more and save CDC slot. SMC autocorking should not bring extra latency since the first message will always been sent out immediately. The qperf tcp_bw test shows more than x4 increase under small message size with Mellanox connectX4-Lx, same result with other throughput benchmarks like sockperf/netperf. The qperf tcp_lat test shows SMC autocorking has not increase any ping-pong latency. Test command: client: smc_run taskset -c 1 qperf smc-server -oo msg_size:1:64K:*2 \ -t 30 -vu tcp_{bw|lat} server: smc_run taskset -c 1 qperf === Bandwidth ==== MsgSize(Bytes) SMC-NoCork TCP SMC-AutoCorking 1 0.578 MB/s 2.392 MB/s(313.57%) 2.647 MB/s(357.72%) 2 1.159 MB/s 4.780 MB/s(312.53%) 5.153 MB/s(344.71%) 4 2.283 MB/s 10.266 MB/s(349.77%) 10.363 MB/s(354.02%) 8 4.668 MB/s 19.040 MB/s(307.86%) 21.215 MB/s(354.45%) 16 9.147 MB/s 38.904 MB/s(325.31%) 41.740 MB/s(356.32%) 32 18.369 MB/s 79.587 MB/s(333.25%) 82.392 MB/s(348.52%) 64 36.562 MB/s 148.668 MB/s(306.61%) 161.564 MB/s(341.89%) 128 72.961 MB/s 274.913 MB/s(276.80%) 325.363 MB/s(345.94%) 256 144.705 MB/s 512.059 MB/s(253.86%) 633.743 MB/s(337.96%) 512 288.873 MB/s 884.977 MB/s(206.35%) 1250.681 MB/s(332.95%) 1024 574.180 MB/s 1337.736 MB/s(132.98%) 2246.121 MB/s(291.19%) 2048 1095.192 MB/s 1865.952 MB/s( 70.38%) 2057.767 MB/s( 87.89%) 4096 2066.157 MB/s 2380.337 MB/s( 15.21%) 2173.983 MB/s( 5.22%) 8192 3717.198 MB/s 2733.073 MB/s(-26.47%) 3491.223 MB/s( -6.08%) 16384 4742.221 MB/s 2958.693 MB/s(-37.61%) 4637.692 MB/s( -2.20%) 32768 5349.550 MB/s 3061.285 MB/s(-42.77%) 5385.796 MB/s( 0.68%) 65536 5162.919 MB/s 3731.408 MB/s(-27.73%) 5223.890 MB/s( 1.18%) ==== Latency ==== MsgSize(Bytes) SMC-NoCork TCP SMC-AutoCorking 1 10.540 us 11.938 us( 13.26%) 10.573 us( 0.31%) 2 10.996 us 11.992 us( 9.06%) 10.269 us( -6.61%) 4 10.229 us 11.687 us( 14.25%) 10.240 us( 0.11%) 8 10.203 us 11.653 us( 14.21%) 10.402 us( 1.95%) 16 10.530 us 11.313 us( 7.44%) 10.599 us( 0.66%) 32 10.241 us 11.586 us( 13.13%) 10.223 us( -0.18%) 64 10.693 us 11.652 us( 8.97%) 10.251 us( -4.13%) 128 10.597 us 11.579 us( 9.27%) 10.494 us( -0.97%) 256 10.409 us 11.957 us( 14.87%) 10.710 us( 2.89%) 512 11.088 us 12.505 us( 12.78%) 10.547 us( -4.88%) 1024 11.240 us 12.255 us( 9.03%) 10.787 us( -4.03%) 2048 11.485 us 16.970 us( 47.76%) 11.256 us( -1.99%) 4096 12.077 us 13.948 us( 15.49%) 12.230 us( 1.27%) 8192 13.683 us 16.693 us( 22.00%) 13.786 us( 0.75%) 16384 16.470 us 23.615 us( 43.38%) 16.459 us( -0.07%) 32768 22.540 us 40.966 us( 81.75%) 23.284 us( 3.30%) 65536 34.192 us 73.003 us(113.51%) 34.233 us( 0.12%) With SMC autocorking support, we can archive better throughput than TCP in most message sizes without any latency trade-off. Signed-off-by: Dust Li Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/smc.h | 2 + net/smc/smc_cdc.c | 11 +++-- net/smc/smc_tx.c | 105 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 103 insertions(+), 15 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index e6919fe31617..6b1d7bb66f52 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -28,6 +28,7 @@ #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ +#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ #define SMC_MAX_HOSTNAME_LEN 32 #define SMC_MAX_EID_LEN 32 @@ -175,6 +176,7 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ + atomic_t tx_pushing; /* nr_threads trying tx push */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 94503f36b9a6..23b96b194a08 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -48,9 +48,14 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && - unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) - wake_up(&conn->cdc_pend_tx_wq); + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { + /* If this is the last pending WR complete, we must push to + * prevent hang when autocork enabled. + */ + smc_tx_sndbuf_nonempty(conn); + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); + } WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); smc_tx_sndbuf_nonfull(smc); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 4379e0981bd5..e5fb97037210 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -127,6 +127,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc) return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } +/* If we have pending CDC messages, do not send: + * Because CQE of this CDC message will happen shortly, it gives + * a chance to coalesce future sendmsg() payload in to one RDMA Write, + * without need for a timer, and with no latency trade off. + * Algorithm here: + * 1. First message should never cork + * 2. If we have pending Tx CDC messages, wait for the first CDC + * message's completion + * 3. Don't cork to much data in a single RDMA Write to prevent burst + * traffic, total corked message should not exceed sendbuf/2 + */ +static bool smc_should_autocork(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + int corking_size; + + corking_size = min(SMC_AUTOCORKING_DEFAULT_SIZE, + conn->sndbuf_desc->len >> 1); + + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || + smc_tx_prepared_sends(conn) > corking_size) + return false; + return true; +} + +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_should_autocork(smc)) + return true; + + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. + */ + if ((msg->msg_flags & MSG_MORE || + smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + atomic_read(&conn->sndbuf_space)) + return true; + + return false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. */ @@ -221,13 +266,10 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - /* for a corked socket defer the RDMA writes if - * sndbuf_space is still available. The applications - * should known how/when to uncork it. + /* If we need to cork, do nothing and wait for the next + * sendmsg() call or push on tx completion */ - if (!((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && - atomic_read(&conn->sndbuf_space))) + if (!smc_tx_should_cork(smc, msg)) smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ @@ -569,13 +611,24 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc; + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc = 0; + + /* No data in the send queue */ + if (unlikely(smc_tx_prepared_sends(conn) <= 0)) + goto out; + + /* Peer don't have RMBE space */ + if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) + goto out; if (conn->killed || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) - return -EPIPE; /* connection being aborted */ + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -EPIPE; /* connection being aborted */ + goto out; + } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -583,10 +636,38 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) if (!rc) { /* trigger socket release if connection is closing */ - struct smc_sock *smc = container_of(conn, struct smc_sock, - conn); smc_close_wake_tx_prepared(smc); } + +out: + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + /* This make sure only one can send simultaneously to prevent wasting + * of CPU and CDC slot. + * Record whether someone has tried to push while we are pushing. + */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + smp_wmb(); /* Make sure tx_pushing is 1 before real send */ + rc = __smc_tx_sndbuf_nonempty(conn); + + /* We need to check whether someone else have added some data into + * the send queue and tried to push but failed after the atomic_set() + * when we are pushing. + * If so, we need to push again to prevent those data hang in the send + * queue. + */ + if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) + goto again; + return rc; } -- Gitee From 0009704803878c1e307513653b6db6ddd6636b33 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 29 May 2023 14:42:37 +0800 Subject: [PATCH 7/9] net/smc: send directly on setting TCP_NODELAY mainline inclusion from mainline-v5.18-rc1 commit b70a5cc045197aad9c159042621baf3c015f6cc7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=b70a5cc045197aad9c159042621baf3c015f6cc7 -------------------------------- In commit ea785a1a573b("net/smc: Send directly when TCP_CORK is cleared"), we don't use delayed work to implement cork. This patch use the same algorithm, removes the delayed work when setting TCP_NODELAY and send directly in setsockopt(). This also makes the TCP_NODELAY the same as TCP. Cc: Tony Lu Signed-off-by: Dust Li Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 25d9580d583a..b45ed561eee3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2227,9 +2227,10 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (val) - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); + if (val) { + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); + } } break; case TCP_CORK: -- Gitee From 9eef8c089a51e6f00e5c2685df068ba622e626e9 Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 29 May 2023 15:14:52 +0800 Subject: [PATCH 8/9] net/smc: don't send in the BH context if sock_owned_by_user mainline inclusion from mainline-v5.18-rc1 commit 6b88af839d204c9283ae09357555e5c4f56c6da5 category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=6b88af839d204c9283ae09357555e5c4f56c6da5 -------------------------------- Send data all the way down to the RDMA device is a time consuming operation(get a new slot, maybe do RDMA Write and send a CDC, etc). Moving those operations from BH to user context is good for performance. If the sock_lock is hold by user, we don't try to send data out in the BH context, but just mark we should send. Since the user will release the sock_lock soon, we can do the sending there. Add smc_release_cb() which will be called in release_sock() and try send in the callback if needed. This patch moves the sending part out from BH if sock lock is hold by user. In my testing environment, this saves about 20% softirq in the qperf 4K tcp_bw test in the sender side with no noticeable throughput drop. Signed-off-by: Dust Li Signed-off-by: David S. Miller Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 16 ++++++++++++++++ net/smc/smc.h | 4 ++++ net/smc/smc_cdc.c | 19 ++++++++++++++----- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b45ed561eee3..c3df7226eb37 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -104,12 +104,27 @@ void smc_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(smc_unhash_sk); +/* This will be called before user really release sock_lock. So do the + * work which we didn't do because of user hold the sock_lock in the + * BH context + */ +static void smc_release_cb(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + if (smc->conn.tx_in_release_sock) { + smc_tx_pending(&smc->conn); + smc->conn.tx_in_release_sock = false; + } +} + struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -122,6 +137,7 @@ struct proto smc_proto6 = { .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, diff --git a/net/smc/smc.h b/net/smc/smc.h index 6b1d7bb66f52..7f0820b5c42e 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -196,6 +196,10 @@ struct smc_connection { * data still pending */ char urg_rx_byte; /* urgent byte */ + bool tx_in_release_sock; + /* flush pending tx data in + * sock release_cb() + */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 23b96b194a08..d473e8d80291 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -49,10 +49,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, } if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { - /* If this is the last pending WR complete, we must push to - * prevent hang when autocork enabled. + /* If user owns the sock_lock, mark the connection need sending. + * User context will later try to send when it release sock_lock + * in smc_release_cb() */ - smc_tx_sndbuf_nonempty(conn); + if (sock_owned_by_user(&smc->sk)) + conn->tx_in_release_sock = true; + else + smc_tx_pending(conn); + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) wake_up(&conn->cdc_pend_tx_wq); } @@ -354,8 +359,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((diff_cons && smc_tx_prepared_sends(conn)) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || - conn->local_rx_ctrl.prod_flags.urg_data_pending) - smc_tx_sndbuf_nonempty(conn); + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (!sock_owned_by_user(&smc->sk)) + smc_tx_pending(conn); + else + conn->tx_in_release_sock = true; + } if (diff_cons && conn->urg_tx_pend && atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) { -- Gitee From bd4713d53f18b4857e0a20c660cdcaf1f1102ecb Mon Sep 17 00:00:00 2001 From: Litao Jiao Date: Mon, 29 May 2023 15:32:55 +0800 Subject: [PATCH 9/9] net/smc: Send out the remaining data in sndbuf before close mainline inclusion from mainline-v5.18-rc1 commit 906b3d64913c19a50d5c553f21b54d2f4ce3ded7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78OQ2 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/smc?id=906b3d64913c19a50d5c553f21b54d2f4ce3ded7 -------------------------------- The current autocork algorithms will delay the data transmission in BH context to smc_release_cb() when sock_lock is hold by user. So there is a possibility that when connection is being actively closed (sock_lock is hold by user now), some corked data still remains in sndbuf, waiting to be sent by smc_release_cb(). This will cause: - smc_close_stream_wait(), which is called under the sock_lock, has a high probability of timeout because data transmission is delayed until sock_lock is released. - Unexpected data sends may happen after connction closed and use the rtoken which has been deleted by remote peer through LLC_DELETE_RKEY messages. So this patch will try to send out the remaining corked data in sndbuf before active close process, to ensure data integrity and avoid unexpected data transmission after close. Reported-by: Guangguan Wang Fixes: 6b88af839d20 ("net/smc: don't send in the BH context if sock_owned_by_user") Signed-off-by: Wen Gu Acked-by: Karsten Graul Link: https://lore.kernel.org/r/1648447836-111521-1-git-send-email-guwen@linux.alibaba.com Signed-off-by: Jakub Kicinski Signed-off-by: Litao Jiao --- net/smc/smc_close.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 84102db5bb31..c98930ab5726 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -57,6 +57,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) if (!smc_tx_prepared_sends(&smc->conn)) return; + /* Send out corked data remaining in sndbuf */ + smc_tx_pending(&smc->conn); + smc->wait_close_tx_prepared = 1; add_wait_queue(sk_sleep(sk), &wait); while (!signal_pending(current) && timeout) { -- Gitee