diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst new file mode 100644 index 0000000000000000000000000000000000000000..fc08b26ca7be37662e676a2912cd81011250ac55 --- /dev/null +++ b/Documentation/networking/smc-sysctl.rst @@ -0,0 +1,39 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +SMC Sysctl +========== + +/proc/sys/net/smc/* Variables +============================= + +smcr_buf_type - INTEGER + Controls which type of sndbufs and RMBs to use in later newly created + SMC-R link group. Only for SMC-R. + + Default: 0 (physically contiguous sndbufs and RMBs) + + Possible values: + + - 0 - Use physically contiguous buffers + - 1 - Use virtually contiguous buffers + - 2 - Mixed use of the two types. Try physically contiguous buffers first. + If not available, use virtually contiguous buffers then. + +wmem - INTEGER + Initial size of send buffer used by SMC sockets. + + The minimum value is 16KiB and there is no hard limit for max value, but + only allowed 512KiB for SMC-R using physically contiguous buffers, 256MiB + for SMC-R using other buf type and 1MiB for SMC-D. + + Default: 64KiB + +rmem - INTEGER + Initial size of receive buffer (RMB) used by SMC sockets. + + The minimum value is 16KiB and there is no hard limit for max value, but + only allowed 512KiB for SMC-R using physically contiguous buffers, 256MiB + for SMC-R using other buf type and 1MiB for SMC-D. + + Default: 64KiB diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c7faca9d7447df74062f7f50da991177b632337f..d415ecbd89585134b69c23e39c214222ceaafa93 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -190,8 +191,11 @@ struct net { struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; - +#if IS_ENABLED(CONFIG_SMC) + KABI_USE(1, struct netns_smc *smc) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h new file mode 100644 index 0000000000000000000000000000000000000000..db3d4e704d9cf81553b196f6633e76eff8783295 --- /dev/null +++ b/include/net/netns/smc.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETNS_SMC_H__ +#define __NETNS_SMC_H__ + +struct netns_smc { +#ifdef CONFIG_SYSCTL + struct ctl_table_header *smc_hdr; +#endif + unsigned int sysctl_smcr_buf_type; + int sysctl_wmem; + int sysctl_rmem; + int sysctl_tcp2smc; +}; +#endif diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 3960bc3da6b30a6f9d53ffc7fb17afb18f87c900..8b9bdc70d2704c105b189581d507f32ddf37d6d3 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,6 +84,9 @@ enum { }; #endif +/* SMC protocol, IPv4 */ +#define SMCPROTO_SMC 0 + #if __UAPI_DEF_IN_ADDR /* Internet address. */ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 5ad396a57eb327a2e69cc288fe1d287d2d05287e..6c21c85be0e3d57061139e5a1aa70e75dbabfd5b 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,6 +95,8 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 +/* SMC protocol, IPv6 */ +#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/net/smc/Makefile b/net/smc/Makefile index cb1254541f371f3eba8fbefe533be97ec6ee3ece..79f53cc7d8dcaa7c2f21c9e1be1cc15b1bd72b89 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o +smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cfae95bfac147df36f740f502f46330ab48a4afc..96c5976b5cfd34fdd6978ea5ce45e317ccf44567 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -48,6 +48,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_sysctl.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -145,11 +146,27 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_fback_restore_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = NULL; + + smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); + smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); + smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); + smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; + smc_fback_restore_callbacks(smc); } } @@ -266,6 +283,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + WRITE_ONCE(sk->sk_sndbuf, 2 * sysctl_smcr_wmem(net)); + WRITE_ONCE(sk->sk_rcvbuf, 2 * sysctl_smcr_rmem(net)); smc = smc_sk(sk); for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { smc->tcp_listen_works[i].smc = smc; @@ -280,6 +299,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); + smc_init_saved_callbacks(smc); return sk; } @@ -325,13 +345,60 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, return rc; } +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from net/core) + */ + +#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_BROADCAST) | \ + (1UL << SOCK_TIMESTAMP) | \ + (1UL << SOCK_DBG) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_RCVTSTAMPNS) | \ + (1UL << SOCK_LOCALROUTE) | \ + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ + (1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_WIFI_STATUS) | \ + (1UL << SOCK_NOFCS) | \ + (1UL << SOCK_FILTER_LOCKED) | \ + (1UL << SOCK_TSTAMP_NEW)) + +/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ +static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + struct net *nnet = sock_net(nsk); + + nsk->sk_userlocks = osk->sk_userlocks; + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { + nsk->sk_sndbuf = osk->sk_sndbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_sndbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); + else + WRITE_ONCE(nsk->sk_sndbuf, + 2 * sysctl_smcr_wmem(nnet)); + } + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { + nsk->sk_rcvbuf = osk->sk_rcvbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_rcvbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); + else + WRITE_ONCE(nsk->sk_rcvbuf, + 2 * sysctl_smcr_rmem(nnet)); + } +} + static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, unsigned long mask) { /* options we don't get control via setsockopt for */ nsk->sk_type = osk->sk_type; - nsk->sk_sndbuf = osk->sk_sndbuf; - nsk->sk_rcvbuf = osk->sk_rcvbuf; nsk->sk_sndtimeo = osk->sk_sndtimeo; nsk->sk_rcvtimeo = osk->sk_rcvtimeo; nsk->sk_mark = osk->sk_mark; @@ -342,26 +409,10 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, nsk->sk_flags &= ~mask; nsk->sk_flags |= osk->sk_flags & mask; + + smc_adjust_sock_bufsizes(nsk, osk, mask); } -#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ - (1UL << SOCK_KEEPOPEN) | \ - (1UL << SOCK_LINGER) | \ - (1UL << SOCK_BROADCAST) | \ - (1UL << SOCK_TIMESTAMP) | \ - (1UL << SOCK_DBG) | \ - (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVTSTAMPNS) | \ - (1UL << SOCK_LOCALROUTE) | \ - (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ - (1UL << SOCK_RXQ_OVFL) | \ - (1UL << SOCK_WIFI_STATUS) | \ - (1UL << SOCK_NOFCS) | \ - (1UL << SOCK_FILTER_LOCKED) | \ - (1UL << SOCK_TSTAMP_NEW)) -/* copy only relevant settings and flags of SOL_SOCKET level from smc to - * clc socket (since smc is not called for these options from net/core) - */ static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) { smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); @@ -377,6 +428,29 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } +/* register the new vzalloced sndbuf on all links */ +static int smcr_lgr_reg_sndbufs(struct smc_link *link, + struct smc_buf_desc *snd_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + if (!snd_desc->is_vm) + return -EINVAL; + + /* protect against parallel smcr_link_reg_buf() */ + down_write(&lgr->llc_conf_lock); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); + if (rc) + break; + } + up_write(&lgr->llc_conf_lock); + return rc; +} + /* register the new rmb on all links */ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) @@ -388,13 +462,13 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, if (rc) return rc; /* protect against parallel smc_llc_cli_rkey_exchange() and - * parallel smcr_link_reg_rmb() + * parallel smcr_link_reg_buf() */ down_write(&lgr->llc_conf_lock); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; - rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); + rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; } @@ -440,8 +514,15 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; @@ -541,11 +622,140 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->r0.qp_mtu; } -static void smc_switch_to_fallback(struct smc_sock *smc) +/* must be called under rcu read lock */ +static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) +{ + struct socket_wq *wq; + __poll_t flags; + + wq = rcu_dereference(smc->sk.sk_wq); + if (!skwq_has_sleeper(wq)) + return; + + /* wake up smc sk->sk_wq */ + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } +} + +static int smc_fback_mark_woken(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) +{ + struct smc_mark_woken *mark = + container_of(wait, struct smc_mark_woken, wait_entry); + + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, + void (*clcsock_callback)(struct sock *sk)) +{ + struct smc_mark_woken mark = { .woken = false }; + struct socket_wq *wq; + + init_waitqueue_func_entry(&mark.wait_entry, + smc_fback_mark_woken); + rcu_read_lock(); + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) + goto out; + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsock_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + + if (mark.woken) + smc_fback_wakeup_waitqueue(smc, mark.key); +out: + rcu_read_unlock(); +} + +static void smc_fback_state_change(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_state_change); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_data_ready); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_write_space(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_write_space); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_error_report(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_error_report); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_replace_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, + &smc->clcsk_state_change); + smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, + &smc->clcsk_data_ready); + smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, + &smc->clcsk_write_space); + smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, + &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + +static int smc_switch_to_fallback(struct smc_sock *smc) { - wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); - wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); - unsigned long flags; + int rc = 0; + + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + rc = -EBADF; + goto out; + } smc->use_fallback = true; if (smc->sk.sk_socket && smc->sk.sk_socket->file) { @@ -554,22 +764,28 @@ static void smc_switch_to_fallback(struct smc_sock *smc) smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - /* There may be some entries remaining in - * smc socket->wq, which should be removed - * to clcsocket->wq during the fallback. + /* There might be some wait entries remaining + * in smc sk->sk_wq and they should be woken up + * as clcsock's wait queue is woken up. */ - spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); - list_splice_init(&smc_wait->head, &clc_wait->head); - spin_unlock(&clc_wait->lock); - spin_unlock_irqrestore(&smc_wait->lock, flags); + smc_fback_replace_callbacks(smc); } +out: + mutex_unlock(&smc->clcsock_release_lock); + return rc; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc); + int rc = 0; + + rc = smc_switch_to_fallback(smc); + if (rc) { /* fallback fails */ + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return rc; + } smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; @@ -831,8 +1047,15 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + /* reg sendbufs if they were vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } + } if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_ERR_REGRMB; + reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } @@ -1224,6 +1447,19 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; + + /* if new clcsock has also inherited the fallback-specific callback + * functions, switch them back to the original ones. + */ + if (lsmc->use_fallback) { + if (lsmc->clcsk_state_change) + new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; + if (lsmc->clcsk_write_space) + new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; + if (lsmc->clcsk_error_report) + new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; + } + (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -1317,8 +1553,15 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced*/ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); @@ -1403,11 +1646,12 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, smc_lgr_cleanup_early(&new_smc->conn); else smc_conn_free(&new_smc->conn); - if (reason_code < 0) { /* error, no fallback possible */ + if (reason_code < 0 || + smc_switch_to_fallback(new_smc)) { + /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { @@ -1646,8 +1890,14 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) struct smc_connection *conn = &new_smc->conn; if (!local_first) { + /* reg sendbufs if they were vzalloced */ + if (conn->sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(conn->lnk, + conn->sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + return SMC_CLC_DECL_ERR_REGBUF; } return 0; @@ -1762,9 +2012,13 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; - smc_listen_out_connected(new_smc); + rc = smc_switch_to_fallback(new_smc); + if (rc) { + smc_listen_out_err(new_smc); + } else { + new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; + smc_listen_out_connected(new_smc); + } return; } @@ -1876,8 +2130,6 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); - new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; - new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); @@ -1892,10 +2144,10 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) { struct smc_sock *lsmc; - lsmc = (struct smc_sock *) - ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); + read_lock_bh(&listen_clcsock->sk_callback_lock); + lsmc = smc_clcsock_user_data(listen_clcsock); if (!lsmc) - return; + goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { int idx = atomic_fetch_inc(&lsmc->tcp_listen_work_seq) % @@ -1904,6 +2156,8 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_works[idx].work)) sock_put(&lsmc->sk); } +out: + read_unlock_bh(&listen_clcsock->sk_callback_lock); } static int smc_listen(struct socket *sock, int backlog) @@ -1935,13 +2189,20 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ - smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; - smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, + smc_clcsock_data_ready, &smc->clcsk_data_ready); + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); + rc = kernel_listen(smc->clcsock, backlog); if (rc) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; @@ -2051,7 +2312,9 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { /* not connected yet, fallback */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); + rc = smc_switch_to_fallback(smc); + if (rc) + goto out; smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; @@ -2260,6 +2523,11 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else @@ -2269,6 +2537,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } + mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -2285,8 +2554,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + rc = smc_switch_to_fallback(smc); + if (!rc) + smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; } @@ -2327,13 +2597,23 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; + int rc; smc = smc_sk(sock->sk); + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } /* socket options apply to the CLC socket */ - if (unlikely(!smc->clcsock->ops->getsockopt)) + if (unlikely(!smc->clcsock->ops->getsockopt)) { + mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; - return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + } + rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); + mutex_unlock(&smc->clcsock_release_lock); + return rc; } static int smc_ioctl(struct socket *sock, unsigned int cmd, @@ -2551,8 +2831,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, sk_common_release(sk); goto out; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); + out: return rc; @@ -2568,11 +2847,17 @@ unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { + int rc; + + rc = smc_sysctl_net_init(net); + if (rc) + return rc; return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { + smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } diff --git a/net/smc/smc.h b/net/smc/smc.h index 6fe4cf14dc2754688409e94517af3fa0b8ceb309..3eafd44e363f967992f01eb00dad07f0aeea8d7f 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,10 +21,6 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 - -#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ -#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ - #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ @@ -130,6 +126,12 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; +struct smc_mark_woken { + bool woken; + void *key; + wait_queue_entry_t wait_entry; +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ @@ -144,7 +146,7 @@ struct smc_connection { struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size_short;/* compressed notation */ + int rmbe_size_comp; /* compressed notation */ int rmbe_update_limit; /* lower limit for consumer * cursor update @@ -230,8 +232,14 @@ struct smc_tcp_listen_work { struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); - /* original data_ready fct. **/ + /* original data_ready fct. */ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ @@ -268,6 +276,41 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline void smc_init_saved_callbacks(struct smc_sock *smc) +{ + smc->clcsk_state_change = NULL; + smc->clcsk_data_ready = NULL; + smc->clcsk_write_space = NULL; + smc->clcsk_error_report = NULL; +} + +static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) +{ + return (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); +} + +/* save target_cb in saved_cb, and replace target_cb with new_cb */ +static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), + void (*new_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + /* only save once */ + if (!*saved_cb) + *saved_cb = *target_cb; + *target_cb = new_cb; +} + +/* restore target_cb to saved_cb, and reset saved_cb to NULL */ +static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + if (!*saved_cb) + return; + *target_cb = *saved_cb; + *saved_cb = NULL; +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 5ee5b2ce29a6e2c99d4f745b6cd435079a385489..c9450ab0e23b2ccb65fb83d3342f668bc5aa7ce4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -658,7 +658,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = conn->lgr->smcd->local_gid; clc->d0.token = conn->rmb_desc->token; - clc->d0.dmbe_size = conn->rmbe_size_short; + clc->d0.dmbe_size = conn->rmbe_size_comp; clc->d0.dmbe_idx = 0; memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); if (version == SMC_V1) { @@ -693,7 +693,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, ETH_ALEN); hton24(clc->r0.qpn, link->roce_qp->qp_num); clc->r0.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); switch (clc->hdr.type) { @@ -704,9 +704,11 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); break; } - clc->r0.rmbe_size = conn->rmbe_size_short; - clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[link->link_idx].sgl)); + clc->r0.rmbe_size = conn->rmbe_size_comp; + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index c579d1d5995a9e94c608eb4f3db521c97d471f5d..8992949900e9f89db98326e4dde3041b644d20db 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -52,7 +52,7 @@ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ +#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index ea2cf4ff72082e26cb3025bbf814dc4ac9f6e655..7ee6d6102dc1144dc34d8a5249f4f157797bb307 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -215,8 +215,11 @@ int smc_close_active(struct smc_sock *smc) sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } smc_close_cleanup_listen(sk); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fdf66b4f9fb2a88f8e103da186f246d0224aa927..dcd884a3cb28a618a91d0adeffc5922584174555 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -30,6 +30,7 @@ #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" +#include "smc_sysctl.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -437,6 +438,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) goto free_wq; lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; + lgr->buf_type = sysctl_smcr_buf_type(sock_net(&smc->sk)); atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; @@ -604,35 +606,38 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, return NULL; } -static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, +static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { + struct rw_semaphore *lock; /* lock buffer list */ int rc; - if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ down_read(&lgr->llc_conf_lock); - smc_llc_do_delete_rkey(lgr, rmb_desc); - rmb_desc->is_conf_rkey = false; + smc_llc_do_delete_rkey(lgr, buf_desc); + buf_desc->is_conf_rkey = false; up_read(&lgr->llc_conf_lock); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } - if (rmb_desc->is_reg_err) { + if (buf_desc->is_reg_err) { /* buf registration failed, reuse not possible */ - down_write(&lgr->rmbs_lock); - list_del(&rmb_desc->list); - up_write(&lgr->rmbs_lock); + lock = is_rmb ? &lgr->rmbs_lock : + &lgr->sndbufs_lock; + down_write(lock); + list_del(&buf_desc->list); + up_write(lock); - smc_buf_free(lgr, true, rmb_desc); + smc_buf_free(lgr, is_rmb, buf_desc); } else { /* memzero_explicit provides potential memory barrier semantics */ - memzero_explicit(rmb_desc->cpu_addr, rmb_desc->len); - WRITE_ONCE(rmb_desc->used, 0); + memzero_explicit(buf_desc->cpu_addr, buf_desc->len); + WRITE_ONCE(buf_desc->used, 0); } } @@ -640,15 +645,21 @@ static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) { - memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); - WRITE_ONCE(conn->sndbuf_desc->used, 0); + if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { + smcr_buf_unuse(conn->sndbuf_desc, false, lgr); + } else { + memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); + WRITE_ONCE(conn->sndbuf_desc->used, 0); + } } - if (conn->rmb_desc && lgr->is_smcd) { - memzero_explicit(conn->rmb_desc->cpu_addr, - conn->rmb_desc->len + sizeof(struct smcd_cdc_msg)); - WRITE_ONCE(conn->rmb_desc->used, 0); - } else if (conn->rmb_desc) { - smcr_buf_unuse(conn->rmb_desc, lgr); + if (conn->rmb_desc) { + if (!lgr->is_smcd) { + smcr_buf_unuse(conn->rmb_desc, true, lgr); + } else { + memzero_explicit(conn->rmb_desc->cpu_addr, + conn->rmb_desc->len + sizeof(struct smcd_cdc_msg)); + WRITE_ONCE(conn->rmb_desc->used, 0); + } } } @@ -681,20 +692,21 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - if (is_rmb) + if (is_rmb || buf_desc->is_vm) buf_desc->is_reg_mr[lnk->link_idx] = false; if (!buf_desc->is_map_ib[lnk->link_idx]) return; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) { - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - buf_desc->mr_rx[lnk->link_idx] = NULL; - } + + if ((is_rmb || buf_desc->is_vm) && + buf_desc->mr[lnk->link_idx]) { + smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); + buf_desc->mr[lnk->link_idx] = NULL; + } + if (is_rmb) smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { + else smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); - } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); buf_desc->is_map_ib[lnk->link_idx] = false; } @@ -762,8 +774,10 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - if (buf_desc->pages) + if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); + else if (buf_desc->is_vm && buf_desc->cpu_addr) + vfree(buf_desc->cpu_addr); kfree(buf_desc); } @@ -1380,21 +1394,42 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) return rc; } -/* convert the RMB size into the compressed notation - minimum 16K. +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 14 /* 0 -> 16KB, 1 -> 32KB, .. 14 -> 256MB */ + +/* convert the RMB size into the compressed notation (minimum 16K, see + * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ -static u8 smc_compress_bufsize(int size) +static u8 smc_compress_bufsize(struct smc_link_group *lgr, int size, bool is_smcd, bool is_rmb) { - u8 compressed; + const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; + u8 compressed, max_phy_compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; - size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; + size = (size - 1) >> 14; /* convert to 16K multiple */ + compressed = min_t(u8, ilog2(size) + 1, + is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb && lgr->buf_type != SMCR_VIRT_CONT_BUFS) { + max_phy_compressed = ilog2(max_scat >> 14); + switch (lgr->buf_type) { + case SMCR_MIXED_BUFS: + if (compressed > max_phy_compressed) + break; + fallthrough; // try phys continguous buf + case SMCR_PHYS_CONT_BUFS: + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, max_phy_compressed); + break; + default: + break; + } + } + return compressed; } @@ -1410,7 +1445,7 @@ int smc_uncompress_bufsize(u8 compressed) /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ -static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, +static struct smc_buf_desc *smc_buf_get_slot(int bufsize, struct rw_semaphore *lock, struct list_head *buf_list) { @@ -1418,7 +1453,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, down_read(lock); list_for_each_entry(buf_slot, buf_list, list) { - if (cmpxchg(&buf_slot->used, 0, 1) == 0) { + if (buf_slot->len == bufsize && (cmpxchg(&buf_slot->used, 0, 1) == 0)) { up_read(lock); return buf_slot; } @@ -1436,26 +1471,49 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -/* map an rmb buf to a link */ +/* map an buf to a link */ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - int rc; + int rc, i, nents, offset, buf_size, size, access_flags; + struct scatterlist *sg; + void *buf; if (buf_desc->is_map_ib[lnk->link_idx]) return 0; - rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (buf_desc->is_vm) { + buf = buf_desc->cpu_addr; + buf_size = buf_desc->len; + offset = offset_in_page(buf_desc->cpu_addr); + nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; + } else { + nents = 1; + } + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); if (rc) return rc; - sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, - buf_desc->cpu_addr, buf_desc->len); + if (buf_desc->is_vm) { + /* virtually contiguous buffer */ + for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { + size = min_t(int, PAGE_SIZE - offset, buf_size); + sg_set_page(sg, vmalloc_to_page(buf), size, offset); + buf += size / sizeof(*buf); + buf_size -= size; + offset = 0; + } + } else { + /* physically contiguous buffer */ + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + } /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { + if (rc != nents) { rc = -EAGAIN; goto free_table; } @@ -1463,11 +1521,13 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, buf_desc->is_dma_need_sync |= smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, + if (is_rmb || buf_desc->is_vm) { + /* create a new memory region for the RMB or vzalloced sndbuf */ + access_flags = is_rmb ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_LOCAL_WRITE; + + rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, buf_desc, lnk->link_idx); if (rc) goto buf_unmap; @@ -1484,20 +1544,23 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, return rc; } -/* register a new rmb on IB device, - * must be called under lgr->llc_conf_mutex lock +/* register a new buf on IB device, rmb or vzalloced sndbuf + * must be called under lgr->llc_conf_lock lock */ -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) { if (list_empty(&link->lgr->list)) return -ENOLINK; - if (!rmb_desc->is_reg_mr[link->link_idx]) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->is_reg_err = true; + if (!buf_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new buf */ + if (buf_desc->is_vm) + buf_desc->mr[link->link_idx]->iova = + (uintptr_t)buf_desc->cpu_addr; + if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { + buf_desc->is_reg_err = true; return -EFAULT; } - rmb_desc->is_reg_mr[link->link_idx] = true; + buf_desc->is_reg_mr[link->link_idx] = true; } return 0; } @@ -1549,18 +1612,39 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) struct smc_buf_desc *buf_desc, *bf; int i, rc = 0; + /* reg all RMBs for a new link */ down_write(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; - rc = smcr_link_reg_rmb(lnk, buf_desc); - if (rc) - goto out; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + up_write(&lgr->rmbs_lock); + return rc; + } } } -out: + up_write(&lgr->rmbs_lock); + + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + return rc; + + /* reg all vzalloced sndbufs for a new link */ + down_write(&lgr->sndbufs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { + if (!buf_desc->used || !buf_desc->is_vm) + continue; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + up_write(&lgr->sndbufs_lock); + return rc; + } + } + } + up_write(&lgr->sndbufs_lock); return rc; } @@ -1574,18 +1658,39 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, if (!buf_desc) return ERR_PTR(-ENOMEM); - buf_desc->order = get_order(bufsize); - buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_COMP | - __GFP_NORETRY | __GFP_ZERO, - buf_desc->order); - if (!buf_desc->pages) { - kfree(buf_desc); - return ERR_PTR(-EAGAIN); - } - buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); - buf_desc->len = bufsize; + switch (lgr->buf_type) { + case SMCR_PHYS_CONT_BUFS: + case SMCR_MIXED_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (buf_desc->pages) { + buf_desc->cpu_addr = + (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + buf_desc->is_vm = false; + break; + } + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + goto out; + fallthrough; // try virtually continguous buf + case SMCR_VIRT_CONT_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); + if (!buf_desc->cpu_addr) + goto out; + buf_desc->pages = NULL; + buf_desc->len = bufsize; + buf_desc->is_vm = true; + break; + } return buf_desc; + +out: + kfree(buf_desc); + return ERR_PTR(-EAGAIN); } /* map buf_desc on all usable links, @@ -1616,17 +1721,12 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, return rc; } -#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ - static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc; - if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) - return ERR_PTR(-EAGAIN); - /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) @@ -1663,33 +1763,29 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; - int bufsize, bufsize_short; + int bufsize, bufsize_comp; struct rw_semaphore *lock; /* lock buffer list */ - int sk_buf_size; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf / 2; + bufsize = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf / 2; - - for (bufsize_short = smc_compress_bufsize(sk_buf_size); - bufsize_short >= 0; bufsize_short--) { + bufsize = smc->sk.sk_sndbuf / 2; + for (bufsize_comp = smc_compress_bufsize(lgr, bufsize, is_smcd, is_rmb); + bufsize_comp >= 0; bufsize_comp--) { if (is_rmb) { lock = &lgr->rmbs_lock; - buf_list = &lgr->rmbs[bufsize_short]; + buf_list = &lgr->rmbs[bufsize_comp]; } else { lock = &lgr->sndbufs_lock; - buf_list = &lgr->sndbufs[bufsize_short]; + buf_list = &lgr->sndbufs[bufsize_comp]; } - bufsize = smc_uncompress_bufsize(bufsize_short); - if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) - continue; + bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(bufsize, lock, buf_list); if (buf_desc) { break; /* found reusable slot */ } @@ -1716,14 +1812,14 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { - smcr_buf_unuse(buf_desc, lgr); + smcr_buf_unuse(buf_desc, is_rmb, lgr); return -ENOMEM; } } if (is_rmb) { conn->rmb_desc = buf_desc; - conn->rmbe_size_short = bufsize_short; + conn->rmbe_size_comp = bufsize_comp; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = @@ -1732,7 +1828,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize * 2; + smc->sk.sk_sndbuf = bufsize; atomic_set(&conn->sndbuf_space, bufsize); } return 0; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 548271424ee5188d6d271c976a76d60c5cd839e4..2089e4d274a287327e188230fbc78cab85c3740d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -150,9 +150,11 @@ struct smc_buf_desc { struct { /* SMC-R */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region + struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; + /* memory region: for rmb and + * vzalloced sndbuf * incl. rkey provided to peer + * and lkey provided to local */ u32 order; /* allocation order */ @@ -165,6 +167,8 @@ struct smc_buf_desc { u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ + u8 is_vm; + /* virtually contiguous */ }; struct { /* SMC-D */ unsigned short sba_idx; @@ -183,6 +187,7 @@ struct smc_rtoken { /* address/key of remote RMB */ }; #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ +#define SMC_RMB_DEF_SIZE 131072 /* default size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, * i.e. compressed 5 and thus 6 sizes (0..5), despite @@ -199,6 +204,12 @@ enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ }; +enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ + SMCR_PHYS_CONT_BUFS = 0, + SMCR_VIRT_CONT_BUFS = 1, + SMCR_MIXED_BUFS = 2, +}; + enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, @@ -254,6 +265,7 @@ struct smc_link_group { /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; + enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */ @@ -411,7 +423,7 @@ int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index c1d6084a416a9e45e6582fea78faee63ff2a6e05..1413665f31153a3fd28cf7464708c2f43e85cfee 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -398,7 +398,7 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + sg_num = ib_map_mr_sg(buf_slot->mr[link_idx], buf_slot->sgt[link_idx].sgl, buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); @@ -410,20 +410,21 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[link_idx]) + if (buf_slot->mr[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[link_idx] = + buf_slot->mr[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[link_idx])) { + if (IS_ERR(buf_slot->mr[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[link_idx]); - buf_slot->mr_rx[link_idx] = NULL; + rc = PTR_ERR(buf_slot->mr[link_idx]); + buf_slot->mr[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != + buf_slot->sgt[link_idx].orig_nents) return -EINVAL; return 0; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 18356f180c154452f73f2bfe976257e27c96f921..6b90bbafdba862d69076b2963181ee8f002c1ddf 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -435,19 +435,22 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, if (smc_link_active(link) && link != send_link) { rkeyllc->rtoken[rtok_ix].link_id = link->link_id; rkeyllc->rtoken[rtok_ix].rmb_key = - htonl(rmb_desc->mr_rx[link->link_idx]->rkey); - rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address( - rmb_desc->sgt[link->link_idx].sgl)); + htonl(rmb_desc->mr[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[link->link_idx].sgl)); rtok_ix++; } } /* rkey of send_link is in rtoken[0] */ rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); - rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); + htonl(rmb_desc->mr[send_link->link_idx]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); put_out: @@ -474,7 +477,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: @@ -724,9 +727,10 @@ static int smc_llc_add_link_cont(struct smc_link *link, } rmb = *buf_pos; - addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); - addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); - addc_llc->rt[i].rmb_vaddr_new = + addc_llc->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = rmb->is_vm ? + cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); (*num_rkeys_todo)--; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 6687354a9369206777012f34d2b355d2f242874b..44583bf2df8e487cde9cdc1d58de46bd4d8fde4d 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -143,35 +143,93 @@ static void smc_rx_spd_release(struct splice_pipe_desc *spd, static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, struct smc_sock *smc) { + struct smc_link_group *lgr = smc->conn.lgr; + int offset = offset_in_page(src); + struct partial_page *partial; struct splice_pipe_desc spd; - struct partial_page partial; - struct smc_spd_priv *priv; - int bytes; + struct smc_spd_priv **priv; + struct page **pages; + int bytes, nr_pages; + int i; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? + PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL); + if (!partial) + goto out_page; + priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL); if (!priv) - return -ENOMEM; - priv->len = len; - priv->smc = smc; - partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; - partial.len = len; - partial.private = (unsigned long)priv; - - spd.nr_pages_max = 1; - spd.nr_pages = 1; - spd.pages = &smc->conn.rmb_desc->pages; - spd.partial = &partial; + goto out_part; + for (i = 0; i < nr_pages; i++) { + priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL); + if (!priv[i]) + goto out_priv; + } + + if (lgr->is_smcd || + (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) { + /* smcd or smcr that uses physically contiguous RMBs */ + priv[0]->len = len; + priv[0]->smc = smc; + partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial[0].len = len; + partial[0].private = (unsigned long)priv[0]; + pages[0] = smc->conn.rmb_desc->pages; + } else { + int size, left = len; + void *buf = src; + /* smcr that uses virtually contiguous RMBs*/ + for (i = 0; i < nr_pages; i++) { + size = min_t(int, PAGE_SIZE - offset, left); + priv[i]->len = size; + priv[i]->smc = smc; + pages[i] = vmalloc_to_page(buf); + partial[i].offset = offset; + partial[i].len = size; + partial[i].private = (unsigned long)priv[i]; + buf += size / sizeof(*buf); + left -= size; + offset = 0; + } + } + spd.nr_pages_max = nr_pages; + spd.nr_pages = nr_pages; + spd.pages = pages; + spd.partial = partial; spd.ops = &smc_pipe_ops; spd.spd_release = smc_rx_spd_release; bytes = splice_to_pipe(pipe, &spd); if (bytes > 0) { sock_hold(&smc->sk); - get_page(smc->conn.rmb_desc->pages); + if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) { + for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++) + get_page(pages[i]); + } else { + get_page(smc->conn.rmb_desc->pages); + } atomic_add(bytes, &smc->conn.splice_pending); } + kfree(priv); + kfree(partial); + kfree(pages); return bytes; + +out_priv: + for (i = (i - 1); i >= 0; i--) + kfree(priv[i]); + kfree(priv); +out_part: + kfree(partial); +out_page: + kfree(pages); +out: + return -ENOMEM; } static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 0000000000000000000000000000000000000000..5360a944620273d9328663d0c68af85ab319c175 --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. + * + * Author: Tony Lu + * + */ + +#include +#include +#include + +#include "smc_sysctl.h" +#include "smc_core.h" + +static int two = 2; +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; +static int max_sndbuf = INT_MAX / 2; +static int max_rcvbuf = INT_MAX / 2; +static const int net_smc_wmem_init = (64 * 1024); +static const int net_smc_rmem_init = (64 * 1024); + +static struct ctl_table smc_table[] = { + { + .procname = "smcr_buf_type", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "wmem", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + .extra2 = &max_sndbuf, + }, + { + .procname = "rmem", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + .extra2 = &max_rcvbuf, + }, + { + .procname = "tcp2smc", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +int sysctl_smcr_buf_type(struct net *net) +{ + return READ_ONCE(net->smc->sysctl_smcr_buf_type); +} + +int sysctl_smcr_wmem(struct net *net) +{ + return READ_ONCE(net->smc->sysctl_wmem); +} + +int sysctl_smcr_rmem(struct net *net) +{ + return READ_ONCE(net->smc->sysctl_rmem); +} + +int __net_init smc_sysctl_net_init(struct net *net) +{ + struct ctl_table *table; + int idx; + + table = smc_table; + net->smc = kmalloc(sizeof(*net->smc), GFP_KERNEL); + if (!net->smc) + goto err_alloc; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_table; + } + + idx = 0; + net->smc->sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; + table[idx++].data = &net->smc->sysctl_smcr_buf_type; + WRITE_ONCE(net->smc->sysctl_wmem, net_smc_wmem_init); + table[idx++].data = &net->smc->sysctl_wmem; + WRITE_ONCE(net->smc->sysctl_rmem, net_smc_rmem_init); + table[idx++].data = &net->smc->sysctl_rmem; + net->smc->sysctl_tcp2smc = 0; + table[idx++].data = &net->smc->sysctl_tcp2smc; + + net->smc->smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc->smc_hdr) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_table: + kfree(net->smc); +err_alloc: + return -ENOMEM; +} + +void __net_exit smc_sysctl_net_exit(struct net *net) +{ + struct ctl_table *table; + + table = net->smc->smc_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->smc->smc_hdr); + if (!net_eq(net, &init_net)) + kfree(table); + kfree(net->smc); +} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 0000000000000000000000000000000000000000..bc0893af5c47ec1730725576a919deaa4d06366f --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. + * + * Author: Tony Lu + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL +int sysctl_smcr_buf_type(struct net *net); +int sysctl_smcr_wmem(struct net *net); +int sysctl_smcr_rmem(struct net *net); +int __net_init smc_sysctl_net_init(struct net *net); +void __net_exit smc_sysctl_net_exit(struct net *net); + +#else +static inline int sysctl_smcr_buf_type(struct net *net) +{ + return SMCR_PHYS_CONT_BUFS; +} + +static inline int sysctl_smcr_wmem(struct net *net) +{ + return SMC_BUF_MIN_SIZE; +} + +static inline int sysctl_smcr_rmem(struct net *net) +{ + return SMC_RMB_DEF_SIZE; +} + +static inline int smc_sysctl_net_init(struct net *net) +{ + return 0; +} + +static inline void smc_sysctl_net_exit(struct net *net) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index ca29268acf4df6306541ad4f83c9bebc6aab63d6..3a4ce37c9f826e7b720437e57a1e93c990433bf5 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -367,6 +367,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); + u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -379,7 +380,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, u64 base_addr = dma_addr; if (dst_len < link->qp_attr.cap.max_inline_data) { - base_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; + base_addr = virt_addr; wr->wr.send_flags |= IB_SEND_INLINE; } else { wr->wr.send_flags &= ~IB_SEND_INLINE; @@ -387,8 +388,12 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sge[srcchunk].addr = base_addr + src_off; + sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? + (virt_addr + src_off) : (base_addr + src_off); sge[srcchunk].length = src_len; + if (conn->sndbuf_desc->is_vm) + sge[srcchunk].lkey = + conn->sndbuf_desc->mr[link->link_idx]->lkey; num_sges++; src_off += src_len; diff --git a/net/socket.c b/net/socket.c index 42c1bbe842362aa5caed38faffb8b80371169788..4b5d80486facdb6e8e322384ec4f2ca2363a7d51 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1369,6 +1369,15 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family = PF_PACKET; } +#if IS_ENABLED(CONFIG_SMC) + if (!kern && net->smc && net->smc->sysctl_tcp2smc && + (family == AF_INET || family == AF_INET6) && + type == SOCK_STREAM && (protocol == IPPROTO_IP || + protocol == IPPROTO_TCP)) { + protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; + family = AF_SMC; + } +#endif err = security_socket_create(family, type, protocol, kern); if (err)