From 9b034f903a300119a5bc766a5ebbf1f37bade07f Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 24 Nov 2022 16:51:56 +0800 Subject: [PATCH 01/76] anolis: net/smc: Revert all Anolis patches ANBZ: #3257 This HUGE patch removes all Anolis patches to backport patches from upstream. Signed-off-by: Tony Lu --- include/net/net_namespace.h | 1 - include/net/netns/smc.h | 22 +- include/net/smc.h | 5 +- include/net/tcp.h | 2 - include/uapi/linux/in.h | 3 - include/uapi/linux/in6.h | 2 - include/uapi/linux/smc.h | 3 - include/uapi/linux/smc_diag.h | 6 - net/ipv4/tcp_input.c | 21 +- net/ipv4/tcp_output.c | 14 +- net/smc/Makefile | 2 +- net/smc/af_smc.c | 398 +++++++++++----------------------- net/smc/smc.h | 33 +-- net/smc/smc_cdc.c | 28 +-- net/smc/smc_cdc.h | 3 +- net/smc/smc_clc.c | 9 - net/smc/smc_clc.h | 3 +- net/smc/smc_close.c | 18 +- net/smc/smc_conv.c | 186 ---------------- net/smc/smc_conv.h | 22 -- net/smc/smc_core.c | 85 +------- net/smc/smc_core.h | 20 +- net/smc/smc_diag.c | 35 ++- net/smc/smc_dim.c | 248 --------------------- net/smc/smc_dim.h | 34 --- net/smc/smc_ib.c | 119 +++------- net/smc/smc_ib.h | 13 +- net/smc/smc_llc.c | 101 +-------- net/smc/smc_llc.h | 5 - net/smc/smc_netlink.c | 19 +- net/smc/smc_netlink.h | 5 - net/smc/smc_proc.c | 339 ----------------------------- net/smc/smc_proc.h | 36 --- net/smc/smc_rx.c | 2 - net/smc/smc_sysctl.c | 102 +-------- net/smc/smc_tx.c | 14 +- net/smc/smc_wr.c | 197 +++++++++-------- net/smc/smc_wr.h | 80 +------ net/socket.c | 35 --- 39 files changed, 329 insertions(+), 1941 deletions(-) delete mode 100644 net/smc/smc_conv.c delete mode 100644 net/smc/smc_conv.h delete mode 100644 net/smc/smc_dim.c delete mode 100644 net/smc/smc_dim.h delete mode 100644 net/smc/smc_proc.c delete mode 100644 net/smc/smc_proc.h diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 220878bfe86b..76e9cce289a4 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,7 +95,6 @@ struct net { 
struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; - struct proc_dir_entry *proc_net_smc; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index e1253a8e0233..2adbe2b245df 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -6,36 +6,18 @@ struct smc_stats_rsn; struct smc_stats; -struct smc_convert { - int wlist_len; - struct mutex wlist_lock; - struct list_head wlist; - int (*smc_conv_match_rcu)(struct net *net, char *comm); -}; - struct netns_smc { /* per cpu counters for SMC */ struct smc_stats __percpu *smc_stats; /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - int limit_smc_hs; /* constraint on handshake */ - struct smc_convert smc_conv; + + bool limit_smc_hs; /* constraint on handshake */ #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; - int sysctl_wmem_default; - int sysctl_rmem_default; - int sysctl_tcp2smc; - int sysctl_allow_different_subnet; - int sysctl_disable_multiple_link; - int sysctl_keep_first_contact_clcsock; - /* allow simplify rkey exchange when single link */ - unsigned int sysctl_simplify_rkey_exhcange; - unsigned int sysctl_smc_fastopen; - /* use diff TCP experiment magic code */ - unsigned int sysctl_smc_experiments; }; #endif diff --git a/include/net/smc.h b/include/net/smc.h index 743b4fe74346..e441aa97ad61 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -12,13 +12,10 @@ #define _SMC_H #define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ -#define SMC_HTABLE_SHIFT 9 -#define SMC_HTABLE_SIZE (1 << SMC_HTABLE_SHIFT) /* Size of SMC hashtable buckets */ struct smc_hashinfo { - unsigned int bkt_idx; rwlock_t lock; - struct hlist_head ht[SMC_HTABLE_SIZE]; + struct hlist_head ht; }; int smc_hash_sk(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 011038a36a4b..1e32e3fb58e1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -195,8 +195,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 -/* "SMCO" in EBCDIC encoding */ -#define TCPOPT_SMC_OK_MAGIC 0xE2D4C3D6 /* * TCP option lengths diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 40b1e51b18c9..d1b327036ae4 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,9 +84,6 @@ enum { }; #endif -/* SMC protocol, IPv4 */ -#define SMCPROTO_SMC 0 - #if __UAPI_DEF_IN_ADDR /* Internet address. */ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 6c21c85be0e3..5ad396a57eb3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,8 +95,6 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 -/* SMC protocol, IPv6 */ -#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 4ec01eb8215e..bb4dacca31e7 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -62,9 +62,6 @@ enum { SMC_NETLINK_DUMP_HS_LIMITATION, SMC_NETLINK_ENABLE_HS_LIMITATION, SMC_NETLINK_DISABLE_HS_LIMITATION, - SMC_NETLINK_ADD_TCP2SMC_WLIST, - SMC_NETLINK_DEL_TCP2SMC_WLIST, - SMC_NETLINK_GET_TCP2SMC_WLIST, }; /* SMC_GENL_FAMILY top level attributes */ diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 182efdd3ec91..8cb3a6fef553 100644 --- a/include/uapi/linux/smc_diag.h +++ 
b/include/uapi/linux/smc_diag.h @@ -79,12 +79,6 @@ struct smc_diag_conninfo { struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ struct smc_diag_cursor tx_sent; /* sent cursor */ struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ - __u64 rx_cnt; /* rx counter */ - __u64 tx_cnt; /* tx counter */ - __u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ - __u64 rx_bytes; /* rx size */ - __u64 tx_bytes; /* tx size */ - __u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ }; /* SMC_DIAG_LINKINFO */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da8bd811dc66..6a8d53d6540b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3912,26 +3912,15 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, - const struct net *net, int opsize) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && - opsize >= TCPOLEN_EXP_SMC_BASE) { - /* syn ack */ - if (th->ack && net->smc.sysctl_smc_experiments) { - if (get_unaligned_be32(ptr) == TCPOPT_SMC_OK_MAGIC) { - opt_rx->smc_ok = 1; - return true; - } - return false; - } - /* syn only */ - if (get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { - opt_rx->smc_ok = 1; - return true; - } + opsize >= TCPOLEN_EXP_SMC_BASE && + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { + opt_rx->smc_ok = 1; + return true; } } #endif @@ -4094,7 +4083,7 @@ void tcp_parse_options(const struct net *net, break; } - if (smc_parse_options(th, opt_rx, ptr, net, opsize)) + if (smc_parse_options(th, opt_rx, ptr, opsize)) break; opt_rx->saw_unknown = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 77776ec6ad4b..97056f4c0bfd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -417,7 +417,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define 
OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) #define OPTION_MPTCP (1 << 10) -#define OPTION_SMC_OK BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -429,12 +428,6 @@ static void smc_options_write(__be32 *ptr, u16 *options) (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); - } else if (OPTION_SMC_OK & *options) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_EXP << 8) | - (TCPOLEN_EXP_SMC_BASE)); - *ptr++ = htonl(TCPOPT_SMC_OK_MAGIC); } } #endif @@ -734,15 +727,10 @@ static void smc_set_option_cond(const struct tcp_sock *tp, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) - const struct sock *sk; - - sk = &tp->inet_conn.icsk_inet.sk; - if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { - opts->options |= sock_net(sk)->smc.sysctl_smc_experiments ? - OPTION_SMC_OK : OPTION_SMC; + opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } diff --git a/net/smc/Makefile b/net/smc/Makefile index f9935659a436..875efcd126a2 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_proc.o smc_conv.o smc_dim.o +smc-y += smc_tracepoint.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bcab1a9102b4..1ce0dc9408b2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -53,8 +52,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_proc.h" -#include "smc_conv.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -70,15 +67,6 @@ 
struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); -static inline int smc_clcsock_enable_fastopen(struct smc_sock *smc, int is_server) -{ - int val = 1; - - return smc->clcsock->ops->setsockopt(smc->clcsock, SOL_TCP, - is_server ? TCP_FASTOPEN : TCP_FASTOPEN_CONNECT, - KERNEL_SOCKPTR(&val), sizeof(val)); -} - int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); @@ -136,8 +124,6 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, struct sock *child; smc = smc_clcsock_user_data(sk); - if (unlikely(!smc)) - goto drop; if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > sk->sk_max_ack_backlog) @@ -195,13 +181,11 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - write_lock_bh(&h->lock); - - head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + head = &h->ht; + write_lock_bh(&h->lock); sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - write_unlock_bh(&h->lock); return 0; @@ -263,9 +247,6 @@ static void smc_fback_restore_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; - if (!clcsk) - return; - write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = NULL; @@ -319,12 +300,8 @@ static int __smc_release(struct smc_sock *smc) smc_clcsock_release(smc); lock_sock(sk); } - - if (!smc->use_fallback) { - sock_hold(sk); - if (!queue_work(smc_close_wq, &smc->free_work)) - sock_put(sk); - } + if (!smc->use_fallback) + smc_conn_free(&smc->conn); } return rc; @@ -348,7 +325,7 @@ static int smc_release(struct socket *sock) if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); - if (smc->connect_nonblock && cancel_work_sync(&smc->connect_work)) + if (cancel_work_sync(&smc->connect_work)) 
sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) @@ -386,29 +363,12 @@ static void smc_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } -static void smc_free_work(struct work_struct *work) -{ - struct sock *sk; - struct smc_sock *smc = container_of(work, struct smc_sock, - free_work); - - sk = &smc->sk; - - lock_sock(sk); - if (sk->sk_state == SMC_CLOSED && !smc->use_fallback) - smc_conn_free(&smc->conn); - release_sock(sk); - - sock_put(sk); /* before queue */ -} - static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, int protocol) { struct smc_sock *smc; struct proto *prot; struct sock *sk; - int i = 0; prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); @@ -419,16 +379,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - sk->sk_sndbuf = net->smc.sysctl_wmem_default; - sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); - smc->keep_clcsock = false; - for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { - smc->tcp_listen_works[i].smc = smc; - INIT_WORK(&smc->tcp_listen_works[i].work, smc_tcp_listen_work); - } - atomic_set(&smc->tcp_listen_work_seq, 0); - INIT_WORK(&smc->free_work, smc_free_work); + INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); @@ -436,13 +388,9 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); - init_rwsem(&smc->clcsock_release_lock); + mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); - /* default behavior from every net namespace */ - smc->simplify_rkey_exhcange = net->smc.sysctl_simplify_rkey_exhcange; - smc->smc_fastopen 
= net->smc.sysctl_smc_fastopen; - return sk; } @@ -478,11 +426,7 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != SMC_INIT || smc->connect_nonblock) goto out_rel; - /* use SO_REUSEADDR to keep first contact clcsock */ - if (sock_net(sk)->smc.sysctl_keep_first_contact_clcsock) - smc->clcsock->sk->sk_reuse = SK_CAN_REUSE; - else - smc->clcsock->sk->sk_reuse = sk->sk_reuse; + smc->clcsock->sk->sk_reuse = sk->sk_reuse; rc = kernel_bind(smc->clcsock, uaddr, addr_len); out_rel: @@ -567,18 +511,15 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, } /* register the new rmb on all links */ -static int smcr_lgr_reg_rmbs(struct smc_sock *smc, +static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { - struct smc_link *link = smc->conn.lnk; struct smc_link_group *lgr = link->lgr; - int i, lnk = 0, rc = 0; + int i, rc = 0; - if (!smc->simplify_rkey_exhcange) { - rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); - if (rc) - return rc; - } + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ @@ -589,32 +530,24 @@ static int smcr_lgr_reg_rmbs(struct smc_sock *smc, rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; - /* available link count inc */ - lnk++; } - /* do not exchange confirm_rkey msg since there are only one link */ - if (lnk > 1 || !smc->simplify_rkey_exhcange) { - /* exchange confirm_rkey msg with peer */ - rc = smc_llc_do_confirm_rkey(link, rmb_desc); - if (rc) { - rc = -EFAULT; - goto out; - } + /* exchange confirm_rkey msg with peer */ + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; } - rmb_desc->is_conf_rkey = true; out: mutex_unlock(&lgr->llc_conf_mutex); - if (!smc->simplify_rkey_exhcange) - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } static int 
smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -661,22 +594,20 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - if (!net->smc.sysctl_disable_multiple_link) { - /* optional 2nd link, receive ADD LINK request from server */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, - SMC_LLC_ADD_LINK); - if (!qentry) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - if (rc == -EAGAIN) - rc = 0; /* no DECLINE received, go with one link */ - return rc; - } - smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - smc_llc_cli_add_link(link, qentry); + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); return 0; } @@ -765,13 +696,6 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; - link->credits_enable = clc->r0.init_credits ? 1 : 0; - if (link->credits_enable) { - atomic_set(&link->peer_rq_credits, clc->r0.init_credits); - // set peer rq credits watermark, if less than init_credits * 2/3, - // then credit announcement is needed. 
- link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); - } } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, @@ -936,7 +860,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { int rc = 0; - down_read(&smc->clcsock_release_lock); + mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { rc = -EBADF; goto out; @@ -951,10 +875,6 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - /* restore sk_reuse which is SK_CAN_REUSE when - * sysctl_keep_first_contact_clcsock enabled. - */ - smc->clcsock->sk->sk_reuse = smc->sk.sk_reuse; /* There might be some wait entries remaining * in smc sk->sk_wq and they should be woken up @@ -963,7 +883,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc_fback_replace_callbacks(smc); } out: - up_read(&smc->clcsock_release_lock); + mutex_unlock(&smc->clcsock_release_lock); return rc; } @@ -1022,10 +942,8 @@ static void smc_conn_abort(struct smc_sock *smc, int local_first) lgr_valid = true; smc_conn_free(conn); - if (local_first && lgr_valid) { - smc->keep_clcsock = false; + if (local_first && lgr_valid) smc_lgr_cleanup_early(lgr); - } } /* check if there is a rdma device available for this connection. 
*/ @@ -1193,13 +1111,9 @@ static int smc_connect_clc(struct smc_sock *smc, rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; - - release_sock(&smc->sk); /* receive SMC Accept CLC message */ - rc = smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, - SMC_CLC_ACCEPT, CLC_WAIT_TIME); - lock_sock(&smc->sk); - return rc; + return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + SMC_CLC_ACCEPT, CLC_WAIT_TIME); } void smc_fill_gid_list(struct smc_link_group *lgr, @@ -1337,11 +1251,6 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { - if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { - reason_code = SMC_CLC_DECL_CREDITSERR; - goto connect_abort; - } - /* reg sendbufs if they were vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { @@ -1349,7 +1258,7 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } - if (smcr_lgr_reg_rmbs(smc, smc->conn.rmb_desc)) { + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } @@ -1614,11 +1523,6 @@ static void smc_connect_work(struct work_struct *work) if (!timeo) timeo = MAX_SCHEDULE_TIMEOUT; - - if (smc->smc_fastopen && - inet_sk(smc->clcsock->sk)->defer_connect) - goto defer_connect; - lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; @@ -1631,7 +1535,6 @@ static void smc_connect_work(struct work_struct *work) rc = 0; } release_sock(smc->clcsock->sk); -defer_connect: lock_sock(&smc->sk); if (rc != 0 || smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; @@ -1705,21 +1608,12 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, break; } - if (!smc->clcsock || - (smc->clcsock && !smc->clcsock->sk)) { - rc = -EBADF; - goto out; - } smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; if (smc->connect_nonblock) { rc = -EALREADY; goto out; } - - if (smc->smc_fastopen 
&& smc_clcsock_enable_fastopen(smc, /* is_server */ 0)) - smc->smc_fastopen = 0; /* rollback when setsockopt failed */ - rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; @@ -1757,27 +1651,35 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc = -EINVAL; - down_read(&lsmc->clcsock_release_lock); - if (lsmc->clcsock) { - if (lsmc->clcsock->sk->sk_ack_backlog) - rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); - else - rc = -EAGAIN; - } - up_read(&lsmc->clcsock_release_lock); - if (rc < 0 && rc != -EAGAIN) - lsk->sk_err = -rc; - if (rc < 0 || lsk->sk_state == SMC_CLOSED) - goto err_out; - + release_sock(lsk); new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; - goto err_out; + *new_smc = NULL; + lock_sock(lsk); + goto out; } *new_smc = smc_sk(new_sk); + mutex_lock(&lsmc->clcsock_release_lock); + if (lsmc->clcsock) + rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); + mutex_unlock(&lsmc->clcsock_release_lock); + lock_sock(lsk); + if (rc < 0 && rc != -EAGAIN) + lsk->sk_err = -rc; + if (rc < 0 || lsk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); + if (new_clcsock) + sock_release(new_clcsock); + new_sk->sk_state = SMC_CLOSED; + sock_set_flag(new_sk, SOCK_DEAD); + sock_put(new_sk); /* final */ + *new_smc = NULL; + goto out; + } + /* new clcsock has inherited the smc listen-specific sk_data_ready * function; switch it back to the original sk_data_ready function */ @@ -1796,12 +1698,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) } (*new_smc)->clcsock = new_clcsock; - - return 0; -err_out: - *new_smc = NULL; - if (new_clcsock) - sock_release(new_clcsock); +out: return rc; } @@ -1815,8 +1712,8 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) sock_hold(sk); /* sock_put in smc_accept_unlink () */ 
spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); - sk_acceptq_added(parent); spin_unlock(&par->accept_q_lock); + sk_acceptq_added(parent); } /* remove a socket from the accept queue of its parental listening socket */ @@ -1826,16 +1723,11 @@ static void smc_accept_unlink(struct sock *sk) spin_lock(&par->accept_q_lock); list_del_init(&smc_sk(sk)->accept_q); - sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); spin_unlock(&par->accept_q_lock); + sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); sock_put(sk); /* sock_hold in smc_accept_enqueue */ } -static inline bool smc_accept_queue_empty(struct sock *sk) -{ - return list_empty(&smc_sk(sk)->accept_q); -} - /* remove a sock from the accept queue to bind it to a new socket created * for a socket accept call from user space */ @@ -1851,13 +1743,10 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); - down_write(&isk->clcsock_release_lock); if (isk->clcsock) { - if (!isk->keep_clcsock) - sock_release(isk->clcsock); + sock_release(isk->clcsock); isk->clcsock = NULL; } - up_write(&isk->clcsock_release_lock); sock_put(new_sk); /* final */ continue; } @@ -1893,7 +1782,6 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -1934,10 +1822,8 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - if (!net->smc.sysctl_disable_multiple_link) { - /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link, NULL); - } + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); return 0; } @@ -1947,7 +1833,7 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = 
new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - if (new_smc->smc_negotiated) + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); if (lsmc->sk.sk_state == SMC_LISTEN) { @@ -2272,7 +2158,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) conn->sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } - if (smcr_lgr_reg_rmbs(new_smc, conn->rmb_desc)) + if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; } @@ -2355,7 +2241,6 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -2363,12 +2248,10 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - if (!net->smc.sysctl_allow_different_subnet) { - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); - } + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -2449,6 +2332,16 @@ static void smc_listen_work(struct work_struct *work) return; } + /* check if peer is smc capable */ + if (!tcp_sk(newclcsock->sk)->syn_smc) { + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); + return; + } + /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ @@ -2540,13 +2433,13 @@ static void smc_listen_work(struct work_struct *work) static void smc_tcp_listen_work(struct work_struct *work) { - struct smc_tcp_listen_work *twork = - container_of(work, struct smc_tcp_listen_work, work); - struct smc_sock *lsmc = 
twork->smc; + struct smc_sock *lsmc = container_of(work, struct smc_sock, + tcp_listen_work); struct sock *lsk = &lsmc->sk; struct smc_sock *new_smc; int rc = 0; + lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); if (rc) /* clcsock accept queue empty or error */ @@ -2554,6 +2447,9 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_inc(&lsmc->queued_smc_hs); + new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -2562,27 +2458,13 @@ static void smc_tcp_listen_work(struct work_struct *work) smc_copy_sock_settings_to_smc(new_smc); new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; - - /* check if peer is smc capable */ - if (!tcp_sk(new_smc->clcsock->sk)->syn_smc) { - sock_hold(&new_smc->sk); /* sock_put in passive closing */ - rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); - if (rc) - smc_listen_out_err(new_smc); - else - smc_listen_out_connected(new_smc); - } else { - new_smc->smc_negotiated = 1; - atomic_inc(&lsmc->queued_smc_hs); - /* memory barrier */ - smp_mb__after_atomic(); - sock_hold(&new_smc->sk); /* sock_put in passive closing */ - if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) - sock_put(&new_smc->sk); - } + sock_hold(&new_smc->sk); /* sock_put in passive closing */ + if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) + sock_put(&new_smc->sk); } out: + release_sock(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } @@ -2596,10 +2478,8 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { - int idx = atomic_fetch_inc(&lsmc->tcp_listen_work_seq) % - SMC_MAX_TCP_LISTEN_WORKS; sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_tcp_ls_wq, 
&lsmc->tcp_listen_works[idx].work)) + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } out: @@ -2653,9 +2533,6 @@ static int smc_listen(struct socket *sock, int backlog) if (smc->limit_smc_hs) tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; - if (smc->smc_fastopen && smc_clcsock_enable_fastopen(smc, /* is server */ 1)) - smc->smc_fastopen = 0; /* rollback when setsockopt failed */ - rc = kernel_listen(smc->clcsock, backlog); if (rc) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); @@ -2678,10 +2555,9 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) { struct sock *sk = sock->sk, *nsk; - DEFINE_WAIT(wait); + DECLARE_WAITQUEUE(wait, current); struct smc_sock *lsmc; long timeo; - bool waited = false; int rc = 0; lsmc = smc_sk(sk); @@ -2694,19 +2570,17 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, goto out; } + /* Wait for an incoming connection */ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { + set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { rc = -EAGAIN; break; } - /* Wait for an incoming connection */ - prepare_to_wait_exclusive(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); - waited = true; release_sock(sk); - if (smc_accept_queue_empty(sk)) - timeo = schedule_timeout(timeo); + timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ sched_annotate_sleep(); lock_sock(sk); @@ -2715,9 +2589,8 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, break; } } - - if (waited) - finish_wait(sk_sleep(sk), &wait); + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); if (!rc) rc = sock_error(nsk); @@ -2752,20 +2625,14 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct smc_sock *smc; - int r = -ENOTCONN; if (peer && (sock->sk->sk_state != SMC_ACTIVE) && 
(sock->sk->sk_state != SMC_APPCLOSEWAIT1)) - goto out; + return -ENOTCONN; smc = smc_sk(sock->sk); - down_read(&smc->clcsock_release_lock); - if (smc->clcsock && smc->clcsock->ops) - r = smc->clcsock->ops->getname(smc->clcsock, addr, peer); - up_read(&smc->clcsock_release_lock); -out: - return r; + return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) @@ -2840,12 +2707,17 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, return rc; } -static inline __poll_t smc_accept_poll(struct sock *parent) +static __poll_t smc_accept_poll(struct sock *parent) { - if (!smc_accept_queue_empty(parent)) - return EPOLLIN | EPOLLRDNORM; + struct smc_sock *isk = smc_sk(parent); + __poll_t mask = 0; - return 0; + spin_lock(&isk->accept_q_lock); + if (!list_empty(&isk->accept_q)) + mask = EPOLLIN | EPOLLRDNORM; + spin_unlock(&isk->accept_q_lock); + + return mask; } static __poll_t smc_poll(struct file *file, struct socket *sock, @@ -2962,7 +2834,7 @@ static int smc_shutdown(struct socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (do_shutdown && smc->clcsock && !smc->keep_clcsock) + if (do_shutdown && smc->clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; @@ -3058,9 +2930,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ - down_read(&smc->clcsock_release_lock); + mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { - up_read(&smc->clcsock_release_lock); + mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } if (unlikely(!smc->clcsock->ops->setsockopt)) @@ -3072,7 +2944,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } - 
up_read(&smc->clcsock_release_lock); + mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -3138,19 +3010,19 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, return __smc_getsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sock->sk); - down_read(&smc->clcsock_release_lock); + mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { - up_read(&smc->clcsock_release_lock); + mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) { - up_read(&smc->clcsock_release_lock); + mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; } rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); - up_read(&smc->clcsock_release_lock); + mutex_unlock(&smc->clcsock_release_lock); return rc; } @@ -3381,6 +3253,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; } + smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); + smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); + out: return rc; } @@ -3497,7 +3372,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc, i; + int rc; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -3520,7 +3395,7 @@ static int __init smc_init(void) rc = -ENOMEM; - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_UNBOUND | WQ_HIGHPRI, 0); + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); if (!smc_tcp_ls_wq) goto out_pnet; @@ -3567,11 +3442,8 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - - for (i = 0; i < SMC_HTABLE_SIZE; i++) { - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); - } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { @@ 
-3585,25 +3457,9 @@ static int __init smc_init(void) goto out_ib; } - rc = smc_proc_init(); - if (rc) { - pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); - goto out_ulp; - } - - rc = smc_conv_init(); - if (rc) { - pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); - goto out_proc; - } - static_branch_enable(&tcp_have_smc); return 0; -out_proc: - smc_proc_exit(); -out_ulp: - tcp_unregister_ulp(&smc_ulp_ops); out_ib: smc_ib_unregister_client(); out_sock: @@ -3634,8 +3490,6 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); - smc_conv_exit(); - smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc.h b/net/smc/smc.h index 7e946c9e3099..5ed765ea0c73 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -22,6 +22,10 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 + +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ + #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ @@ -228,25 +232,12 @@ struct smc_connection { u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ - u64 rx_cnt; /* rx counter */ - u64 tx_cnt; /* tx counter */ - u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ - u64 rx_bytes; /* rx size */ - u64 tx_bytes; /* tx size */ - u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 freed : 1; /* normal termiation */ u8 out_of_sync : 1; /* out of sync with peer */ }; -#define SMC_MAX_TCP_LISTEN_WORKS 2 - -struct smc_tcp_listen_work { - struct smc_sock *smc; - struct work_struct work; -}; - struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ @@ -260,19 +251,12 @@ struct smc_sock { /* smc sock container */ /* original error_report fct. 
*/ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ - bool keep_clcsock; struct work_struct connect_work; /* handle non-blocking connect*/ - struct smc_tcp_listen_work tcp_listen_works[SMC_MAX_TCP_LISTEN_WORKS]; - /* handle tcp socket accepts */ - atomic_t tcp_listen_work_seq;/* used to select tcp_listen_works */ + struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ - struct work_struct free_work; /* free smc conn */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool limit_smc_hs; /* put constraint on handshake */ - bool simplify_rkey_exhcange; /* simplify rkey exchange */ - /* enable SMC-R handshake proposal via tcp fastopen */ - bool smc_fastopen; bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ @@ -289,16 +273,11 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ - u8 smc_negotiated : 1; - /* whether the smc_sock - * was successfully negotiated - * via TCP options. 
- */ u8 connect_nonblock : 1; /* non-blocking connect in * flight */ - struct rw_semaphore clcsock_release_lock; + struct mutex clcsock_release_lock; /* protects clcsock of a listen * socket * */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index fc29948e245b..53f63bfbaf5f 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -111,36 +111,26 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; - struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; - u8 saved_credits = 0; int rc; - if (unlikely(!READ_ONCE(conn->sndbuf_desc))) - return -EINVAL; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc(cdc_msg, conn, &cfed); - if (smc_wr_rx_credits_need_announce_frequent(link)) - saved_credits = (u8)smc_wr_rx_get_credits(link); - cdc_msg->credits = saved_credits; + smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (likely(!rc)) { + if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_wr_rx_put_credits(link, saved_credits); - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) - wake_up(&conn->cdc_pend_tx_wq); + atomic_dec(&conn->cdc_pend_tx_wr); } return rc; @@ -172,10 +162,8 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn, smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (unlikely(rc)) { - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) - wake_up(&conn->cdc_pend_tx_wq); - } + if (unlikely(rc)) + atomic_dec(&conn->cdc_pend_tx_wr); return rc; } @@ -370,8 
+358,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, } /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - if ((diff_cons && smc_tx_prepared_sends(conn) && - conn->local_tx_ctrl.prod_flags.write_blocked) || + if ((diff_cons && smc_tx_prepared_sends(conn)) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || conn->local_rx_ctrl.prod_flags.urg_data_pending) { if (!sock_owned_by_user(&smc->sk)) @@ -458,9 +445,6 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ - if (cdc->credits) - smc_wr_tx_put_credits(link, cdc->credits, true); - /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 145ce7997e64..696cc11f2303 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,8 +47,7 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 credits; /* credits synced by every cdc msg */ - u8 reserved[17]; + u8 reserved[18]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 365831c683f1..1472f31480d8 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -795,13 +795,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = send_len; - down_read(&smc->clcsock_release_lock); - if (!smc->clcsock || !smc->clcsock->sk) { - up_read(&smc->clcsock_release_lock); - return -EPROTO; - } len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); - up_read(&smc->clcsock_release_lock); if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 
0 : len; @@ -1046,12 +1040,9 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; - clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); - clc->r0.init_credits = - link->credits_enable ? (u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_short; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 7b068f7e0519..5fee545c9a10 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -63,7 +63,6 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ -#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -191,7 +190,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved; __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 74321f6b2230..31db7438857c 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -25,18 +25,15 @@ void smc_clcsock_release(struct smc_sock *smc) { struct socket *tcp; - if (smc->listen_smc && !smc->use_fallback && - current_work() != &smc->smc_listen_work) + if (smc->listen_smc && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); - down_write(&smc->clcsock_release_lock); - /* don't release clcsock for eRDMA */ + mutex_lock(&smc->clcsock_release_lock); if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; - if (!smc->keep_clcsock) - sock_release(tcp); + sock_release(tcp); } - up_write(&smc->clcsock_release_lock); + 
mutex_unlock(&smc->clcsock_release_lock); } static void smc_close_cleanup_listen(struct sock *parent) @@ -202,7 +199,6 @@ int smc_close_active(struct smc_sock *smc) long timeout; int rc = 0; int rc1 = 0; - int i = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? @@ -227,8 +223,7 @@ int smc_close_active(struct smc_sock *smc) } smc_close_cleanup_listen(sk); release_sock(sk); - for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) - flush_work(&smc->tcp_listen_works[i].work); + flush_work(&smc->tcp_listen_work); lock_sock(sk); break; case SMC_ACTIVE: @@ -244,8 +239,7 @@ int smc_close_active(struct smc_sock *smc) /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. */ - if (smc->clcsock && smc->clcsock->sk && - !smc->keep_clcsock) { + if (smc->clcsock && smc->clcsock->sk) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? rc : rc1; diff --git a/net/smc/smc_conv.c b/net/smc/smc_conv.c deleted file mode 100644 index e1f87d1de8a5..000000000000 --- a/net/smc/smc_conv.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include -#include "smc_netlink.h" -#include "smc_conv.h" - -int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *wlist_elem, *tmp; - char msg[TASK_COMM_LEN]; - struct nlattr *na; - - na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; - if (!na) - return -EINVAL; - - nla_strlcpy(msg, na, TASK_COMM_LEN); - - mutex_lock(wlist_lock); - if (*wlist_len >= SMC_MAX_WLIST_LEN) { - mutex_unlock(wlist_lock); - return -EINVAL; - } - - list_for_each_entry(tmp, wlist, list) { - if (!strcmp(tmp->task_comm, msg)) - goto out; - } - - wlist_elem = kmalloc(sizeof(*wlist_elem), 
GFP_KERNEL); - if (!wlist_elem) { - mutex_unlock(wlist_lock); - return -ENOMEM; - } - - strcpy(wlist_elem->task_comm, msg); - list_add_tail_rcu(&wlist_elem->list, wlist); - ++*wlist_len; -out: - mutex_unlock(wlist_lock); - return 0; -} - -int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *tmp, *nxt; - char msg[TASK_COMM_LEN]; - struct nlattr *na; - - na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; - if (!na) - return -EINVAL; - - nla_strlcpy(msg, na, TASK_COMM_LEN); - - mutex_lock(wlist_lock); - list_for_each_entry_safe(tmp, nxt, wlist, list) { - if (!strcmp(tmp->task_comm, msg)) { - list_del_rcu(&tmp->list); - synchronize_rcu(); - kfree(tmp); - --*wlist_len; - break; - } - } - mutex_unlock(wlist_lock); - return 0; -} - -int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - struct list_head *wlist = &net->smc.smc_conv.wlist; - struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); - struct smc_conv_wlist_elem *tmp; - void *nlh; - - if (cb_ctx->pos[0]) - goto errmsg; - - nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &smc_gen_nl_family, NLM_F_MULTI, - SMC_NETLINK_GET_TCP2SMC_WLIST); - if (!nlh) - goto errmsg; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp, wlist, list) { - if (nla_put(skb, SMC_CMD_ATTR_TCP2SMC, - nla_total_size(strlen(tmp->task_comm) + 1), - tmp->task_comm)) { - rcu_read_unlock(); - goto errattr; - } - } - rcu_read_unlock(); - - genlmsg_end(skb, nlh); - cb_ctx->pos[0] = 1; - return skb->len; - -errattr: - genlmsg_cancel(skb, nlh); -errmsg: - return skb->len; -} - -static int smc_match_tcp2smc_wlist(struct net *net, char *comm) -{ - struct list_head *wlist = &net->smc.smc_conv.wlist; - struct 
smc_conv_wlist_elem *tmp; - - rcu_read_lock(); - list_for_each_entry_rcu(tmp, wlist, list) { - if (!strcmp(tmp->task_comm, comm)) { - rcu_read_unlock(); - return 0; - } - } - rcu_read_unlock(); - return -1; -} - -static int __net_init smc_net_conv_init(struct net *net) -{ - INIT_LIST_HEAD_RCU(&net->smc.smc_conv.wlist); - net->smc.smc_conv.wlist_len = 0; - - mutex_init(&net->smc.smc_conv.wlist_lock); - - rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, - smc_match_tcp2smc_wlist); - return 0; -} - -static void __net_exit smc_net_conv_exit(struct net *net) -{ - struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; - struct list_head *wlist = &net->smc.smc_conv.wlist; - int *wlist_len = &net->smc.smc_conv.wlist_len; - struct smc_conv_wlist_elem *cur, *nxt; - struct list_head tmp_list; - - rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, NULL); - synchronize_rcu(); - - INIT_LIST_HEAD(&tmp_list); - - mutex_lock(wlist_lock); - list_splice_init_rcu(wlist, &tmp_list, synchronize_rcu); - *wlist_len = 0; - mutex_unlock(wlist_lock); - - list_for_each_entry_safe(cur, nxt, &tmp_list, list) { - list_del(&cur->list); - kfree(cur); - } -} - -static struct pernet_operations smc_conv_ops = { - .init = smc_net_conv_init, - .exit = smc_net_conv_exit, -}; - -int __init smc_conv_init(void) -{ - return register_pernet_subsys(&smc_conv_ops); -} - -void smc_conv_exit(void) -{ - unregister_pernet_subsys(&smc_conv_ops); -} diff --git a/net/smc/smc_conv.h b/net/smc/smc_conv.h deleted file mode 100644 index 1615b27feede..000000000000 --- a/net/smc/smc_conv.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef NET_SMC_SMC_CONV_H_ -#define NET_SMC_SMC_CONV_H_ -#include -#include -#include - -#define SMC_MAX_WLIST_LEN 32 - -struct smc_conv_wlist_elem { - char task_comm[TASK_COMM_LEN]; - struct list_head list; -}; - -int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); -int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct 
genl_info *info); -int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb); -int __init smc_conv_init(void); -void smc_conv_exit(void); - -#endif /* NET_SMC_SMC_CONV_H_ */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5c985a86f186..0b833b73dd6f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -627,20 +627,11 @@ int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) void smc_lgr_cleanup_early(struct smc_link_group *lgr) { - struct smc_link *link; spinlock_t *lgr_lock; - u8 link_idx; if (!lgr) return; - /* ONLY one link expected */ - link_idx = SMC_SINGLE_LINK; - link = &lgr->lnk[link_idx]; - if (link) - /* current is fallback, do not release clcsock */ - link->clcsock = NULL; - smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ @@ -840,13 +831,11 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { - struct smc_ib_device *ibdev; struct smc_link_group *lgr; struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; - int ibport; int rc = 0; int i; @@ -900,6 +889,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ + struct smc_ib_device *ibdev; + int ibport; + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; lgr->smc_version = ini->smcr_version; memcpy(lgr->peer_systemid, ini->peer_systemid, @@ -915,13 +907,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) ibdev = ini->ib_dev; ibport = ini->ib_port; } - mutex_lock(&smc_ib_devices.mutex); - if (list_empty(&ibdev->list) || - test_bit(ibport, ibdev->ports_going_away)) { - /* ibdev unavailable */ - rc = SMC_CLC_DECL_NOSMCRDEV; - goto free_wq; - } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); if (smc_wr_alloc_lgr_mem(lgr)) @@ -931,8 +916,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; smcr_link_iw_extension(&lnk->iw_conn_param, smc->clcsock->sk); - if (smc->keep_clcsock) - lnk->clcsock = smc->clcsock; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { @@ -949,13 +932,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_lock_bh(lgr_lock); list_add_tail(&lgr->list, lgr_list); spin_unlock_bh(lgr_lock); - if (!ini->is_smcd) - mutex_unlock(&smc_ib_devices.mutex); return 0; free_wq: - if (!ini->is_smcd) - mutex_unlock(&smc_ib_devices.mutex); destroy_workqueue(lgr->tx_wq); free_lgr: kfree(lgr); @@ -964,16 +943,10 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id); out: if (rc < 0) { - switch (rc) { - case -ENOMEM: + if (rc == -ENOMEM) rc = SMC_CLC_DECL_MEM; - break; - case SMC_CLC_DECL_NOSMCRDEV: - break; - default: + else rc = SMC_CLC_DECL_INTERR; - break; - } } return rc; } @@ -1288,20 +1261,13 @@ static void __smcr_link_clear(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; - struct socket *clcsock; - smcr_buf_unmap_lgr(lnk); - smc_ib_destroy_queue_pair(lnk); - smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); - clcsock = lnk->clcsock; 
put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; - if (clcsock) - sock_release(clcsock); if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ @@ -1316,9 +1282,12 @@ void smcr_link_clear(struct smc_link *lnk, bool log) lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); + smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_error(lnk); smc_wr_free_link(lnk); + smc_ib_destroy_queue_pair(lnk); + smc_ib_dealloc_protection_domain(lnk); smcr_link_put(lnk); /* theoretically last link_put */ } @@ -1338,11 +1307,8 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, { int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_UNUSED) - continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - } if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); @@ -1706,9 +1672,6 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) lgr->type == SMC_LGR_ASYMMETRIC_PEER || !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; - if (lgr->type == SMC_LGR_SINGLE && - lgr->net->smc.sysctl_disable_multiple_link) - continue; /* trigger local add link processing */ link = smc_llc_usable_link(lgr); @@ -1898,20 +1861,6 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } -static void smc_rx_tx_counter_init(struct smc_connection *conn) -{ - /* Initialize RX & TX diagnostic inform for each - * connection. 
These counters mean what smc wants - * net devices "TODO" insead of what has been "DONE" - */ - conn->rx_cnt = 0; - conn->tx_cnt = 0; - conn->tx_corked_cnt = 0; - conn->rx_bytes = 0; - conn->tx_bytes = 0; - conn->tx_corked_bytes = 0; -} - /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -1976,9 +1925,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) create: if (ini->first_contact_local) { - /* keep this clcsock for QP reuse */ - if (net->smc.sysctl_keep_first_contact_clcsock) - smc->keep_clcsock = true; rc = smc_lgr_create(smc, ini); if (rc) goto out; @@ -1987,7 +1933,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); if (rc) { - smc->keep_clcsock = false; smc_lgr_cleanup_early(lgr); goto out; } @@ -2000,7 +1945,6 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); - smc_rx_tx_counter_init(conn); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); @@ -2310,7 +2254,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, static int smcr_buf_map_usable_links(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc, bool is_rmb) { - int i, rc = 0, lnk_cnt = 0; + int i, rc = 0; /* protect against parallel link reconfiguration */ mutex_lock(&lgr->llc_conf_mutex); @@ -2323,12 +2267,9 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, rc = -ENOMEM; goto out; } - lnk_cnt++; } out: mutex_unlock(&lgr->llc_conf_mutex); - if (!lnk_cnt) - rc = -EINVAL; return rc; } @@ -2378,7 +2319,6 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) bool is_dgraded = false; struct mutex *lock; /* lock buffer 
list */ int sk_buf_size; - int rc = 0; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ @@ -2435,10 +2375,9 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) return PTR_ERR(buf_desc); if (!is_smcd) { - rc = smcr_buf_map_usable_links(lgr, buf_desc, is_rmb); - if (rc) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { smcr_buf_unuse(buf_desc, is_rmb, lgr); - return rc; + return -ENOMEM; } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index d92332436a07..f9b7dd15479d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -21,12 +21,7 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 32 /* max. # of RMBs per link group. Correspondingly, - * SMC_WR_BUF_CNT should not be less than 2 * - * SMC_RMBS_PER_LGR_MAX, since every connection at - * least has two rq/sq credits in average, otherwise - * may result in waiting for credits in sending process. - */ +#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -85,8 +80,6 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 -#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 - struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -94,7 +87,6 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ - struct smc_ib_cq *smcibcq; /* cq for recv & send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ @@ -132,15 +124,6 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ - atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ - atomic_t local_rq_credits; /* credits for local rq flowctrl */ - u8 credits_enable; /* credits enable flag, set when negotiation */ - u8 local_cr_watermark_high; 
/* local rq credits watermark */ - u8 peer_cr_watermark_low; /* peer rq credits watermark */ - u8 credits_update_limit; /* credits update limit for cdc msg */ - struct work_struct credits_announce_work; /* work for credits announcement */ - unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ - u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ @@ -167,7 +150,6 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ - struct socket *clcsock; /* keep for eRDMA */ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index bbe00b50b666..25ef26b621a2 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,12 +136,6 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, - .rx_cnt = conn->rx_cnt, - .tx_cnt = conn->tx_cnt, - .tx_corked_cnt = conn->tx_corked_cnt, - .rx_bytes = conn->rx_bytes, - .tx_bytes = conn->tx_bytes, - .tx_corked_bytes = conn->tx_corked_bytes, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) @@ -202,25 +196,24 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; - int rc = 0, num = 0, slot; + int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - - for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { - head = &prot->h.smc_hash->ht[slot]; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + head = &prot->h.smc_hash->ht; + if 
(hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; - } + num++; } out: diff --git a/net/smc/smc_dim.c b/net/smc/smc_dim.c deleted file mode 100644 index 280696beb54a..000000000000 --- a/net/smc/smc_dim.c +++ /dev/null @@ -1,248 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2022, Alibaba Group. - * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. - */ - -#include -#include "smc_dim.h" - -#define SMC_IS_SIGNIFICANT_DIFF(val, ref, threshold) \ - ((ref) && (((100UL * abs((val) - (ref))) / (ref)) >= (threshold))) - -#define SMC_CPMS_THRESHOLD 5 -#define SMC_CPERATIO_THRESHOLD 25 -#define SMC_MAX_FLUCTUATIONS 3 -#define CPU_IDLE_UTIL_THRESHOLD 5 -#define CPU_SOFTIRQ_UTIL_THRESHOLD 10 - -#define SMC_DIM_PARAMS_NUM_PROFILES 4 -#define SMC_DIM_START_PROFILE 0 - -static const struct dim_cq_moder -smc_dim_profile[SMC_DIM_PARAMS_NUM_PROFILES] = { - {1, 0, 2, 0}, - {4, 0, 8, 0}, - {16, 0, 16, 0}, - {32, 0, 32, 0}, -}; - -static void smc_dim_work(struct work_struct *w) -{ - struct dim *dim = container_of(w, struct dim, work); - struct ib_cq *cq = dim->priv; - - u16 usec = smc_dim_profile[dim->profile_ix].usec; - u16 comps = smc_dim_profile[dim->profile_ix].comps; - - dim->state = DIM_START_MEASURE; - cq->device->ops.modify_cq(cq, comps, usec); -} - -void smc_dim_init(struct ib_cq *cq) -{ - struct smc_dim *smc_dim; - struct dim *dim; - - if (!cq->device->ops.modify_cq) - return; - - smc_dim = kzalloc(sizeof(*smc_dim), GFP_KERNEL); - if (!smc_dim) - return; - - smc_dim->use_dim = cq->device->use_cq_dim; - dim = to_dim(smc_dim); - dim->state = DIM_START_MEASURE; - dim->tune_state = DIM_GOING_RIGHT; - dim->profile_ix = SMC_DIM_START_PROFILE; - dim->priv = cq; - cq->dim = dim; - INIT_WORK(&dim->work, smc_dim_work); -} - -void smc_dim_destroy(struct ib_cq *cq) 
-{ - if (!cq->dim) - return; - - cancel_work_sync(&cq->dim->work); - kfree(cq->dim); -} - -static inline void smc_dim_param_clear(struct dim *dim) -{ - dim->steps_right = 0; - dim->steps_left = 0; - dim->tired = 0; - dim->profile_ix = SMC_DIM_START_PROFILE; - dim->tune_state = DIM_GOING_RIGHT; -} - -static inline void smc_dim_reset(struct dim *dim) -{ - int prev_ix = dim->profile_ix; - - smc_dim_param_clear(dim); - if (prev_ix != dim->profile_ix) - schedule_work(&dim->work); - else - dim->state = DIM_START_MEASURE; -} - -static int smc_dim_step(struct dim *dim) -{ - if (dim->tune_state == DIM_GOING_RIGHT) { - if (dim->profile_ix == (SMC_DIM_PARAMS_NUM_PROFILES - 1)) - return DIM_ON_EDGE; - dim->profile_ix++; - dim->steps_right++; - } - if (dim->tune_state == DIM_GOING_LEFT) { - if (dim->profile_ix == 0) - return DIM_ON_EDGE; - dim->profile_ix--; - dim->steps_left++; - } - - return DIM_STEPPED; -} - -static int smc_dim_stats_compare(struct dim_stats *curr, struct dim_stats *prev) -{ - /* first stat */ - if (!prev->cpms) - return DIM_STATS_BETTER; - - if (SMC_IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms, SMC_CPMS_THRESHOLD)) - return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER : - DIM_STATS_WORSE; - - if (SMC_IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio, SMC_CPERATIO_THRESHOLD)) - return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER : - DIM_STATS_WORSE; - - return DIM_STATS_SAME; -} - -static void smc_dim_exit_parking(struct dim *dim) -{ - dim->tune_state = dim->profile_ix ? 
DIM_GOING_LEFT : DIM_GOING_RIGHT; - smc_dim_step(dim); - dim->tired = 0; -} - -static bool smc_dim_decision(struct dim_stats *curr_stats, struct dim *dim) -{ - int prev_state = dim->tune_state; - int prev_ix = dim->profile_ix; - int stats_res = smc_dim_stats_compare(curr_stats, - &dim->prev_stats); - - if (curr_stats->cpms < 50) { - smc_dim_param_clear(dim); - goto out; - } - - switch (dim->tune_state) { - case DIM_PARKING_ON_TOP: - if (stats_res != DIM_STATS_SAME) { - if (dim->tired++ > SMC_MAX_FLUCTUATIONS) - smc_dim_exit_parking(dim); - } else { - dim->tired = 0; - } - break; - case DIM_GOING_RIGHT: - case DIM_GOING_LEFT: - if (stats_res != DIM_STATS_BETTER) { - dim_turn(dim); - } else if (dim_on_top(dim)) { - dim_park_on_top(dim); - break; - } - - if (smc_dim_step(dim) == DIM_ON_EDGE) - dim_park_on_top(dim); - break; - } - -out: - if (prev_state != DIM_PARKING_ON_TOP || - dim->tune_state != DIM_PARKING_ON_TOP) - dim->prev_stats = *curr_stats; - - return dim->profile_ix != prev_ix; -} - -static bool smc_dim_check_utilization(struct dim *dim) -{ - struct smc_dim *smc_dim = to_smcdim(dim); - int cpu = smp_processor_id(); - struct kernel_cpustat kcpustat; - u32 idle_percent, softirq_percent; - u64 wall, wall_idle, diff_wall, softirq; - - wall_idle = get_cpu_idle_time(cpu, &wall, 1); - kcpustat_cpu_fetch(&kcpustat, cpu); - - softirq = div_u64(kcpustat_field(&kcpustat, CPUTIME_SOFTIRQ, cpu), NSEC_PER_USEC); - diff_wall = wall - smc_dim->prev_wall; - idle_percent = div64_u64(100 * (wall_idle - smc_dim->prev_idle), diff_wall); - softirq_percent = div64_u64(100 * (softirq - smc_dim->prev_softirq), diff_wall); - - smc_dim->prev_softirq = softirq; - smc_dim->prev_idle = wall_idle; - smc_dim->prev_wall = wall; - - return idle_percent < CPU_IDLE_UTIL_THRESHOLD && - softirq_percent >= CPU_SOFTIRQ_UTIL_THRESHOLD; -} - -void smc_dim(struct dim *dim, u64 completions) -{ - struct ib_cq *cq = dim->priv; - struct smc_dim *smc_dim = to_smcdim(dim); - struct dim_sample *curr_sample 
= &dim->measuring_sample; - struct dim_stats curr_stats; - u32 nevents; - - if (unlikely(smc_dim->use_dim != cq->device->use_cq_dim)) { - smc_dim->use_dim = cq->device->use_cq_dim; - if (!smc_dim->use_dim) - smc_dim_reset(dim); - } - - if (!smc_dim->use_dim) - return; - - dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0, - curr_sample->comp_ctr + completions, - &dim->measuring_sample); - - switch (dim->state) { - case DIM_MEASURE_IN_PROGRESS: - nevents = curr_sample->event_ctr - dim->start_sample.event_ctr; - if (nevents < DIM_NEVENTS) - break; - if (!smc_dim_check_utilization(dim)) { - smc_dim_reset(dim); - break; - } - dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats); - if (smc_dim_decision(&curr_stats, dim)) { - dim->state = DIM_APPLY_NEW_PROFILE; - schedule_work(&dim->work); - break; - } - fallthrough; - case DIM_START_MEASURE: - dim->state = DIM_MEASURE_IN_PROGRESS; - dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0, - curr_sample->comp_ctr, - &dim->start_sample); - break; - case DIM_APPLY_NEW_PROFILE: - break; - } -} diff --git a/net/smc/smc_dim.h b/net/smc/smc_dim.h deleted file mode 100644 index bc8175f7b708..000000000000 --- a/net/smc/smc_dim.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2022, Alibaba Group. 
- */ - -#ifndef _SMC_DIM_H -#define _SMC_DIM_H - -#include -#include - -struct smc_dim { - struct dim dim; - bool use_dim; - u64 prev_idle; - u64 prev_softirq; - u64 prev_wall; -}; - -static inline struct smc_dim *to_smcdim(struct dim *dim) -{ - return (struct smc_dim *)dim; -} - -static inline struct dim *to_dim(struct smc_dim *smcdim) -{ - return (struct dim *)smcdim; -} - -void smc_dim_init(struct ib_cq *cq); -void smc_dim_destroy(struct ib_cq *cq); -void smc_dim(struct dim *dim, u64 completions); - -#endif /* _SMC_DIM_H */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9492365c3d05..1cb600767e88 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -27,7 +27,6 @@ #include "smc_wr.h" #include "smc.h" #include "smc_netlink.h" -#include "smc_dim.h" #define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ @@ -132,7 +131,10 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + IB_CQ_SOLICITED_MASK); + if (rc) + goto out; rc = smc_wr_rx_post_init(lnk); if (rc) goto out; @@ -622,31 +624,6 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev) -{ - struct smc_ib_cq *smcibcq, *cq; - int min, i; - - smcibcq = smcibdev->smcibcq; - cq = smcibcq; - min = cq->load; - - for (i = 0; i < smcibdev->num_cq; i++) { - if (smcibcq[i].load < min) { - cq = &smcibcq[i]; - min = cq->load; - } - } - - cq->load++; - return cq; -} - -static void smc_ib_put_cq(struct smc_ib_cq *smcibcq) -{ - smcibcq->load--; -} - static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -670,33 +647,27 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - if (lnk->roce_qp) { + if (lnk->roce_qp) ib_destroy_qp(lnk->roce_qp); 
- smc_ib_put_cq(lnk->smcibcq); - } lnk->roce_qp = NULL; - lnk->smcibcq = NULL; } /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - struct smc_ib_cq *smcibcq = smc_ib_get_least_used_cq(lnk->smcibdev); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = smcibcq->ib_cq, - .recv_cq = smcibcq->ib_cq, + .send_cq = lnk->smcibdev->roce_cq_send, + .recv_cq = lnk->smcibdev->roce_cq_recv, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND. - * RDMA_WRITE consumes send queue entities, - * without recv queue entities. + * there are max. 2 RDMA_WRITE per 1 WR_SEND */ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT, + .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, .max_inline_data = 0, @@ -714,12 +685,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); - if (IS_ERR(lnk->roce_qp)) { + if (IS_ERR(lnk->roce_qp)) lnk->roce_qp = NULL; - } else { - lnk->smcibcq = smcibcq; + else smc_wr_remember_qp_attr(lnk); - } return rc; } @@ -866,27 +835,11 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } -static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) -{ - int i; - - for (i = 0; i < smcibdev->num_cq; i++) { - if (smcibdev->smcibcq[i].ib_cq) { - smc_dim_destroy(smcibdev->smcibcq[i].ib_cq); - ib_destroy_cq(smcibdev->smcibcq[i].ib_cq); - } - } - smc_wr_remove_dev(smcibdev); - - kfree(smcibdev->smcibcq); -} - long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { - struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; + struct ib_cq_init_attr cqattr = { + .cqe = SMC_MAX_CQE, .comp_vector = 0 }; int cqe_size_order, 
smc_order; - struct smc_ib_cq *smcibcq; - int i, num_cq; long rc; mutex_lock(&smcibdev->mutex); @@ -898,40 +851,28 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - num_cq = min_t(int, smcibdev->ibdev->num_comp_vectors, - num_online_cpus()); - smcibdev->num_cq = num_cq; - smcibdev->smcibcq = kcalloc(num_cq, sizeof(*smcibcq), GFP_KERNEL); - if (!smcibdev->smcibcq) { - rc = -ENOMEM; - goto err; + smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); + if (IS_ERR(smcibdev->roce_cq_send)) { + smcibdev->roce_cq_send = NULL; + goto out; } - - /* initialize CQs */ - for (i = 0; i < num_cq; i++) { - smcibcq = &smcibdev->smcibcq[i]; - smcibcq->smcibdev = smcibdev; - cqattr.comp_vector = i; - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_cq_handler, NULL, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) { - smcibcq->ib_cq = NULL; - goto err; - } - - smc_dim_init(smcibcq->ib_cq); - rc = ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP); - if (rc) - goto err; + smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); + if (IS_ERR(smcibdev->roce_cq_recv)) { + smcibdev->roce_cq_recv = NULL; + goto err; } smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; goto out; err: - smc_ib_cleanup_cq(smcibdev); + ib_destroy_cq(smcibdev->roce_cq_send); out: mutex_unlock(&smcibdev->mutex); return rc; @@ -943,7 +884,9 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) goto out; smcibdev->initialized = 0; - smc_ib_cleanup_cq(smcibdev); + ib_destroy_cq(smcibdev->roce_cq_recv); + ib_destroy_cq(smcibdev->roce_cq_send); + 
smc_wr_remove_dev(smcibdev); out: mutex_unlock(&smcibdev->mutex); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 62f4e5619147..034295676e88 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -32,20 +32,15 @@ struct smc_ib_devices { /* list of smc ib devices definition */ extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ -struct smc_ib_cq { /* ib_cq wrapper for smc */ - struct smc_ib_device *smcibdev; /* parent ib device */ - struct ib_cq *ib_cq; /* real ib_cq for link */ - struct tasklet_struct tasklet; /* tasklet for wr */ - int load; /* load of current cq */ -}; - struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - int num_cq; /* num of snd/rcv cq */ - struct smc_ib_cq *smcibcq; /* send & recv cqs */ + struct ib_cq *roce_cq_send; /* send completion queue */ + struct ib_cq *roce_cq_recv; /* recv completion queue */ + struct tasklet_struct send_tasklet; /* called by send cq handler */ + struct tasklet_struct recv_tasklet; /* called by recv cq handler */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f9fb12382f9e..6fea7a2f7e3b 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -75,8 +75,7 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 init_credits; /* QP rq init credits for rq flowctrl */ - u8 reserved[7]; + u8 reserved[8]; }; struct smc_llc_msg_add_link_cont_rt { @@ -171,12 +170,6 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; -struct smc_llc_msg_announce_credits { /* type 0x0A */ - struct smc_llc_hdr hd; - u8 credits; - u8 reserved[39]; -}; - struct smc_llc_msg_delete_rkey_v2 { /* 
type 0x29 */ struct smc_llc_hdr hd; u8 num_rkeys; @@ -196,7 +189,6 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; - struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -760,46 +752,6 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -/* send credits announce request or response */ -int smc_llc_announce_credits(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool force) -{ - struct smc_llc_msg_announce_credits *announce_credits; - struct smc_wr_tx_pend_priv *pend; - struct smc_wr_buf *wr_buf; - int rc; - u8 saved_credits = 0; - - if (!link->credits_enable || - (!force && !smc_wr_rx_credits_need_announce(link))) - return 0; - - saved_credits = (u8)smc_wr_rx_get_credits(link); - if (!saved_credits) - /* maybe synced by cdc msg */ - return 0; - - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); - if (rc) { - smc_wr_rx_put_credits(link, saved_credits); - return rc; - } - - announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; - memset(announce_credits, 0, sizeof(*announce_credits)); - announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; - announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); - if (reqresp == SMC_LLC_RESP) - announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; - announce_credits->credits = saved_credits; - /* send llc message */ - rc = smc_wr_tx_send(link, pend); - if (rc) - smc_wr_rx_put_credits(link, saved_credits); - - return rc; -} - /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -1063,13 +1015,6 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; - link->credits_enable = add_llc->init_credits ? 
1 : 0; - if (link->credits_enable) { - atomic_set(&link->peer_rq_credits, add_llc->init_credits); - // set peer rq credits watermark, if less than init_credits * 2/3, - // then credit announcement is needed. - link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); - } } /* as an SMC client, process an add link request */ @@ -1090,9 +1035,6 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) rc = -ENOMEM; goto out_reject; } - if (lgr->type == SMC_LGR_SINGLE && - lgr->net->smc.sysctl_disable_multiple_link) - goto out_reject; ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { @@ -1218,9 +1160,6 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, if (lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; - if (lgr->type == SMC_LGR_SINGLE && - lgr->net->smc.sysctl_disable_multiple_link) - goto out; ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) @@ -1466,9 +1405,6 @@ int smc_llc_srv_add_link(struct smc_link *link, rc = -ENOMEM; goto out; } - if (lgr->type == SMC_LGR_SINGLE && - lgr->net->smc.sysctl_disable_multiple_link) - goto out; /* ignore client add link recommendation, start new flow */ ini->vlan_id = lgr->vlan_id; @@ -1999,10 +1935,6 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; - case SMC_LLC_ANNOUNCE_CREDITS: - if (smc_link_active(link)) - smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); - break; case SMC_LLC_REQ_ADD_LINK: /* handle response here, smc_llc_flow_stop() cannot be called * in tasklet context @@ -2088,10 +2020,6 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; - case SMC_LLC_ANNOUNCE_CREDITS: - if (smc_link_active(link)) - smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); - break; default: smc_llc_protocol_violation(link->lgr, 
qentry->msg.raw.hdr.common.type); @@ -2185,27 +2113,6 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } -static void smc_llc_announce_credits_work(struct work_struct *work) -{ - struct smc_link *link = container_of(work, - struct smc_link, credits_announce_work); - int rc, retry = 0, agains = 0; - -again: - do { - rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); - } while ((rc == -EBUSY) && smc_link_sendable(link) && - (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); - - if (smc_wr_rx_credits_need_announce(link) && - smc_link_sendable(link) && agains <= 5 && !rc) { - agains++; - goto again; - } - - clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); -} - void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -2241,7 +2148,6 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); - INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; } @@ -2273,7 +2179,6 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); - cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -2388,10 +2293,6 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, - { - .handler = smc_llc_rx_handler, - .type = SMC_LLC_ANNOUNCE_CREDITS - }, /* V2 types */ { .handler = smc_llc_rx_handler, diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index f8a14643faf4..4404e52b3346 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -20,8 +20,6 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) -#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY 
(1) - enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -37,7 +35,6 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, - SMC_LLC_ANNOUNCE_CREDITS = 0X0A, /* V2 types */ SMC_LLC_CONFIRM_LINK_V2 = 0x21, SMC_LLC_ADD_LINK_V2 = 0x22, @@ -89,8 +86,6 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); -int smc_llc_announce_credits(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index 52dba083b70e..c5a62f6f52ba 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -22,7 +22,6 @@ #include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" -#include "smc_conv.h" const struct nla_policy smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { @@ -127,25 +126,9 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_hs_limitation, }, - { - .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .doit = smc_nl_add_tcp2smc_wlist, - }, - { - .cmd = SMC_NETLINK_DEL_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .doit = smc_nl_del_tcp2smc_wlist, - }, - { - .cmd = SMC_NETLINK_GET_TCP2SMC_WLIST, - /* can be retrieved by unprivileged users */ - .dumpit = smc_nl_get_tcp2smc_wlist, - }, }; -static const struct nla_policy smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { - [SMC_CMD_ATTR_TCP2SMC] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN - 1 }, +static const struct nla_policy smc_gen_nl_policy[2] = { [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, }; diff --git a/net/smc/smc_netlink.h 
b/net/smc/smc_netlink.h index aae13737095e..e8c6c3f0e98c 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -15,11 +15,6 @@ #include #include -enum { - SMC_CMD_ATTR_TCP2SMC = 1, - SMC_CMD_MAX_ATTR, -}; - extern struct genl_family smc_gen_nl_family; extern const struct nla_policy smc_gen_ueid_policy[]; diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c deleted file mode 100644 index 106887b7b9e1..000000000000 --- a/net/smc/smc_proc.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include "smc.h" -#include "smc_proc.h" -#include "smc_core.h" - -static void *smc_get_next(struct seq_file *seq, void *cur) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? - smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - struct net *net = seq_file_net(seq); - struct hlist_head *head; - struct sock *sk = cur; - - if (!sk) { - read_lock(&smc_hash->lock); -get_head: - head = &smc_hash->ht[sp->bucket]; - sk = sk_head(head); - sp->offset = 0; - goto get_sk; - } - ++sp->num; - ++sp->offset; - - sk = sk_next(sk); -get_sk: - sk_for_each_from(sk) { - if (!net_eq(sock_net(sk), net)) - continue; - return sk; - } - sp->offset = 0; - if (++sp->bucket < SMC_HTABLE_SIZE) - goto get_head; - - read_unlock(&smc_hash->lock); - return NULL; -} - -static void *smc_seek_last_pos(struct seq_file *seq) -{ - struct smc_proc_private *sp = seq->private; - int offset = sp->offset; - int orig_num = sp->num; - void *rc = NULL; - - if (sp->bucket >= SMC_HTABLE_SIZE) - goto out; - - rc = smc_get_next(seq, NULL); - while (offset-- && rc) - rc = smc_get_next(seq, rc); - - if (rc) - goto out; - - sp->bucket = 0; -out: - sp->num = orig_num; - return rc; -} - -static void *smc_get_idx(struct seq_file *seq, loff_t pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - sp->bucket = 0; - rc = smc_get_next(seq, NULL); - - while (rc && pos) { - rc = 
smc_get_next(seq, rc); - --pos; - } - return rc; -} - -static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - if (*pos && *pos == sp->last_pos) { - rc = smc_seek_last_pos(seq); - if (rc) - goto out; - } - - sp->num = 0; - sp->bucket = 0; - sp->offset = 0; - sp->protocol = protocol; - rc = *pos ? smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - -out: - sp->last_pos = *pos; - return rc; -} - -static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC); -} - -static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC6); -} - -static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) -{ - struct smc_proc_private *sp = seq->private; - const struct in6_addr *dest, *src; - struct smc_link_group *lgr; - struct socket *clcsock; - struct smc_link *lnk; - struct sock *sk; - bool fb = false; - int i; - - fb = smc->use_fallback; - clcsock = smc->clcsock; - sk = &smc->sk; - - if (protocol == SMCPROTO_SMC) - seq_printf(seq, CONN4_ADDR_FM, sp->num, - clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, - clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); - else if (protocol == SMCPROTO_SMC6) { - dest = &clcsock->sk->sk_v6_daddr; - src = &clcsock->sk->sk_v6_rcv_saddr; - seq_printf(seq, CONN6_ADDR_FM, sp->num, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); - } - - seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, - sk, clcsock->sk, fb ? 
clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); - - lgr = smc->conn.lgr; - lnk = smc->conn.lnk; - - if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { - for (i = 0; i < SMC_LGR_ID_SIZE; i++) - seq_printf(seq, "%02X", lgr->id[i]); - - seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 'C' : 'S', - lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, smc->conn.tx_cnt, smc->conn.tx_bytes, - smc->conn.tx_corked_cnt, smc->conn.tx_corked_bytes); - } else { - seq_puts(seq, "- - - - - - -" - " - - -\n"); - } -} - -static int smc_conn_show(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct socket *clcsock; - struct smc_sock *smc; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, - "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", - "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_P", "tx_B", "cork_P", "cork_B"); - goto out; - } - - smc = smc_sk(v); - clcsock = smc->clcsock; - if (!clcsock) - goto out; - - _conn_show(seq, smc, sp->protocol); -out: - return 0; -} - -static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc = NULL; - - if (v == SEQ_START_TOKEN) { - rc = smc_get_idx(seq, 0); - goto out; - } - rc = smc_get_next(seq, v); -out: - ++*pos; - sp->last_pos = *pos; - return rc; -} - -static void smc_conn_stop(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? 
- smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - - if (v && v != SEQ_START_TOKEN) - read_unlock(&smc_hash->lock); -} - -static struct smc_proc_entry smc_proc[] = { - { - .name = "smc4", - .ops = { - .show = smc_conn_show, - .start = smc_conn4_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#if IS_ENABLED(CONFIG_IPV6) - { - .name = "smc6", - .ops = { - .show = smc_conn_show, - .start = smc_conn6_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#endif -}; - -extern struct smc_lgr_list smc_lgr_list; -static int proc_show_links(struct seq_file *seq, void *v) -{ - struct smc_link_group *lgr, *lg; - struct smc_link *lnk; - int i = 0, j = 0; - - seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s\n", - "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", - "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags"); - - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lnk = &lgr->lnk[i]; - if (!smc_link_usable(lnk)) - continue; - for (j = 0; j < SMC_LGR_ID_SIZE; j++) - seq_printf(seq, "%02X", lgr->id[j]); - seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu\n", - lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, - lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, - lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, - lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, - atomic_read(&lnk->local_rq_credits), - atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, - lnk->peer_cr_watermark_low, lnk->flags); - } - } - spin_unlock_bh(&smc_lgr_list.lock); - return 0; -} - -static int proc_open_links(struct inode *inode, struct file *file) -{ - single_open(file, proc_show_links, NULL); - return 0; -} - -static struct proc_ops link_file_ops = { -.proc_open = proc_open_links, -.proc_read = seq_read, -.proc_release = single_release, -}; - -static int __net_init smc_proc_dir_init(struct net *net) -{ - int i, rc = -ENOMEM; - - net->proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); - if (!net->proc_net_smc) - goto err; - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { - if (!proc_create_net_data(smc_proc[i].name, 0444, - net->proc_net_smc, &smc_proc[i].ops, - sizeof(struct smc_proc_private), - NULL)) - goto err_entry; - } - - if (!proc_create("links", 0444, net->proc_net_smc, &link_file_ops)) - goto err_entry; - - return 0; - -err_entry: - for (i -= 1; i >= 0; i--) - remove_proc_entry(smc_proc[i].name, net->proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -err: - return rc; -} - -static void __net_exit smc_proc_dir_exit(struct net *net) -{ - int i; - - remove_proc_entry("links", net->proc_net_smc); - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) - remove_proc_entry(smc_proc[i].name, net->proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -} - -static struct pernet_operations smc_proc_ops = { - .init = smc_proc_dir_init, - .exit = smc_proc_dir_exit, -}; - -int __init smc_proc_init(void) -{ - return register_pernet_subsys(&smc_proc_ops); -} - -void smc_proc_exit(void) -{ - unregister_pernet_subsys(&smc_proc_ops); -} diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h deleted file mode 100644 index faa5eaaee511..000000000000 --- a/net/smc/smc_proc.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: 
GPL-2.0-only */ -#ifndef _SMC_PROC_H_ -#define _SMC_PROC_H_ - -#include -#include -#include -#include -#include -#include "smc.h" - -#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ - "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ - "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") -#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") -#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8llu %-8llu %-8llu %-8llu\n") - -struct smc_proc_private { - struct seq_net_private p; - int num, bucket, offset; - int protocol; - loff_t last_pos; -}; - -struct smc_proc_entry { - const char *name; - const struct seq_operations ops; -}; - -int __init smc_proc_init(void); -void smc_proc_exit(void); - -#endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 4b548e118268..17c5aee7ee4f 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -450,7 +450,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable--; /* always stop at urgent Byte */ /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); - conn->rx_bytes += copylen; /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - @@ -498,7 +497,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } trace_smc_rx_recvmsg(smc, copylen); - ++conn->rx_cnt; } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 2b2bf13fc986..39b236f868bd 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -17,10 +17,6 @@ #include "smc.h" #include "smc_core.h" #include "smc_sysctl.h" -#include "smc_core.h" - -static int min_sndbuf = SMC_BUF_MIN_SIZE; -static int min_rcvbuf = SMC_BUF_MIN_SIZE; static int 
two = 2; @@ -41,92 +37,6 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, - { - .procname = "wmem_default", - .data = &init_net.smc.sysctl_wmem_default, - .maxlen = sizeof(init_net.smc.sysctl_wmem_default), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &init_net.smc.sysctl_rmem_default, - .maxlen = sizeof(init_net.smc.sysctl_rmem_default), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { - .procname = "tcp2smc", - .data = &init_net.smc.sysctl_tcp2smc, - .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "allow_different_subnet", - .data = &init_net.smc.sysctl_allow_different_subnet, - .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "limit_handshake", - .data = &init_net.smc.limit_smc_hs, - .maxlen = sizeof(init_net.smc.limit_smc_hs), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "disable_multiple_link", - .data = &init_net.smc.sysctl_disable_multiple_link, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "simplify_rkey_exhcange", - .data = &init_net.smc.sysctl_simplify_rkey_exhcange, - .maxlen = sizeof(init_net.smc.sysctl_simplify_rkey_exhcange), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "fastopen", - .data = &init_net.smc.sysctl_smc_fastopen, - .maxlen = sizeof(init_net.smc.sysctl_smc_fastopen), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = 
"sysctl_smc_experiments", - .data = &init_net.smc.sysctl_smc_experiments, - .maxlen = sizeof(init_net.smc.sysctl_smc_experiments), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "keep_first_contact_clcsock", - .data = &init_net.smc.sysctl_keep_first_contact_clcsock, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; @@ -152,17 +62,7 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; - net->smc.sysctl_wmem_default = 256 * 1024; - net->smc.sysctl_rmem_default = 384 * 1024; - net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_allow_different_subnet = 1; - net->smc.sysctl_keep_first_contact_clcsock = 1; - net->smc.sysctl_disable_multiple_link = 1; - /* default on */ - net->smc.sysctl_simplify_rkey_exhcange = 1; - net->smc.sysctl_smc_fastopen = 1; - /* default off */ - net->smc.sysctl_smc_experiments = 0; + return 0; err_reg: diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index c7beaa1f38d9..4e8377657a62 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -282,14 +282,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) /* If we need to cork, do nothing and wait for the next * sendmsg() call or push on tx completion */ - if (!smc_tx_should_cork(smc, msg)) { - conn->tx_bytes += copylen; - ++conn->tx_cnt; + if (!smc_tx_should_cork(smc, msg)) smc_tx_sndbuf_nonempty(conn); - } else { - conn->tx_corked_bytes += copylen; - ++conn->tx_corked_cnt; - } trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ @@ -357,12 +351,6 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; - /* rtoken might be deleted if 
peer freed connection */ - if (!rdma_wr->rkey || - (rdma_wr->remote_addr == (conn->tx_off + peer_rmbe_offset))) { - pr_warn_ratelimited("smc: unexpected sends during connection termination flow\n"); - return -EINVAL; - } rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index a4b9ba6532f9..26f8f240d9e8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -30,7 +30,6 @@ #include "smc.h" #include "smc_wr.h" -#include "smc_dim.h" #define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */ @@ -131,8 +130,40 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - if (wq_has_sleeper(&link->wr_tx_wait)) - wake_up(&link->wr_tx_wait); + wake_up(&link->wr_tx_wait); +} + +static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) +{ + struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); + struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int i = 0, rc; + int polled = 0; + +again: + polled++; + do { + memset(&wc, 0, sizeof(wc)); + rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_send, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; + for (i = 0; i < rc; i++) + smc_wr_tx_process_cqe(&wc[i]); + } while (rc > 0); + if (polled == 1) + goto again; +} + +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->send_tasklet); } /*---------------------------- request submission ---------------------------*/ @@ -142,16 +173,11 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; - - if (!smc_wr_tx_get_credit(link)) - return -EBUSY; - for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if 
(!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; - smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -257,7 +283,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - smc_wr_tx_put_credits(link, 1, true); + wake_up(&link->wr_tx_wait); return 1; } else if (link->lgr->smc_version == SMC_V2 && pend->idx == link->wr_tx_cnt) { @@ -280,6 +306,8 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { @@ -295,6 +323,8 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); @@ -337,6 +367,8 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; link->wr_reg.mr = mr; @@ -406,7 +438,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) if (wc->byte_len < sizeof(*wr_rx)) return; /* short message */ - temp_wr_id = wc->wr_id / 2; + temp_wr_id = wc->wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { @@ -415,86 +447,73 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) } } -static inline void smc_wr_rx_process_cqe(struct ib_wc *wc) +static 
inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) { - struct smc_link *link = wc->qp->qp_context; + struct smc_link *link; + int i; - if (wc->status == IB_WC_SUCCESS) { - link->wr_rx_tstamp = jiffies; - smc_wr_rx_demultiplex(wc); - smc_wr_rx_post(link); /* refill WR RX */ - } else { - /* handle status errors */ - switch (wc->status) { - case IB_WC_RETRY_EXC_ERR: - case IB_WC_RNR_RETRY_EXC_ERR: - case IB_WC_WR_FLUSH_ERR: - smcr_link_down_cond_sched(link); - break; - default: + for (i = 0; i < num; i++) { + link = wc[i].qp->qp_context; + if (wc[i].status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; + smc_wr_rx_demultiplex(&wc[i]); smc_wr_rx_post(link); /* refill WR RX */ - break; + } else { + /* handle status errors */ + switch (wc[i].status) { + case IB_WC_RETRY_EXC_ERR: + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + smcr_link_down_cond_sched(link); + break; + default: + smc_wr_rx_post(link); /* refill WR RX */ + break; + } } } - - if (smc_wr_rx_credits_need_announce(link) && - !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { - set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); - schedule_work(&link->credits_announce_work); - } } -int smc_wr_rx_post_init(struct smc_link *link) +static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - u32 i; - int rc = 0; - - for (i = 0; i < link->wr_rx_cnt; i++) - rc = smc_wr_rx_post(link); - // credits have already been announced to peer - atomic_set(&link->local_rq_credits, 0); - return rc; -} - -static void smc_wr_tasklet_fn(struct tasklet_struct *t) -{ - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); + struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i, rc, completed = 0; + int polled = 0; + int rc; again: + polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - for (i = 0; i < rc; i++) { - if (smc_wr_id_is_rx(wc[i].wr_id)) - 
smc_wr_rx_process_cqe(&wc[i]); - else - smc_wr_tx_process_cqe(&wc[i]); + rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_recv, + IB_CQ_SOLICITED_MASK + | IB_CQ_REPORT_MISSED_EVENTS); } - - if (rc > 0) - completed += rc; + if (!rc) + break; + smc_wr_rx_process_cqes(&wc[0], rc); } while (rc > 0); - - /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, - * then it is safe to wait for the next event; else we must poll the - * CQ again to make sure we won't miss any event. - */ - if (ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0) + if (polled == 1) goto again; +} - if (smcibcq->ib_cq->dim) - smc_dim(smcibcq->ib_cq->dim, completed); +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->recv_tasklet); } -void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context) +int smc_wr_rx_post_init(struct smc_link *link) { - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; + u32 i; + int rc = 0; - tasklet_schedule(&smcibcq->tasklet); + for (i = 0; i < link->wr_rx_cnt; i++) + rc = smc_wr_rx_post(link); + return rc; } /***************************** init, exit, misc ******************************/ @@ -528,7 +547,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, lnk->qp_attr.cap.max_recv_wr); } @@ -555,7 +574,8 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; lnk->wr_tx_ibs[i].num_sge = 1; lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; - lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED; + lnk->wr_tx_ibs[i].send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; if (send_inline) lnk->wr_tx_ibs[i].send_flags |= 
IB_SEND_INLINE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; @@ -575,7 +595,8 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; lnk->wr_tx_v2_ib->num_sge = 1; lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; - lnk->wr_tx_v2_ib->send_flags = IB_SEND_SIGNALED; + lnk->wr_tx_v2_ib->send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; } /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. @@ -718,7 +739,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -726,7 +747,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -745,7 +766,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) @@ -812,20 +833,14 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq; i++) - tasklet_kill(&smcibdev->smcibcq[i].tasklet); + tasklet_kill(&smcibdev->recv_tasklet); + tasklet_kill(&smcibdev->send_tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq; i++) { - tasklet_setup(&smcibdev->smcibcq[i].tasklet, - smc_wr_tasklet_fn); - } + tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); + 
tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) @@ -834,7 +849,7 @@ int smc_wr_create_link(struct smc_link *lnk) int rc = 0; smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); - lnk->wr_rx_id = 1; + lnk->wr_rx_id = 0; lnk->wr_rx_dma_addr = ib_dma_map_single( ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, DMA_FROM_DEVICE); @@ -874,16 +889,6 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); - atomic_set(&lnk->peer_rq_credits, 0); - atomic_set(&lnk->local_rq_credits, 0); - lnk->flags = 0; - lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); - lnk->peer_cr_watermark_low = 0; - - /* if credits accumlated less than 10% of wr_rx_cnt(at least 5), - * will not be announced by cdc msg. - */ - lnk->credits_update_limit = max(lnk->wr_rx_cnt / 10, 5U); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 7f3909b5ac64..a54e90a1110f 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,12 +19,7 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT - * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, - * since every connection at least has two rq/sq - * credits in average, otherwise may result in - * waiting for credits in sending process. 
- */ +#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -56,7 +51,7 @@ struct smc_wr_rx_handler { */ static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link) { - return atomic_long_add_return(2, &link->wr_tx_id); + return atomic_long_inc_return(&link->wr_tx_id); } static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) @@ -88,62 +83,6 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } -// get one tx credit, and peer rq credits dec -static inline int smc_wr_tx_get_credit(struct smc_link *link) -{ - return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; -} - -// put tx credits, when some failures occurred after tx credits got -// or receive announce credits msgs -static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) -{ - if (link->credits_enable && credits) { - atomic_add(credits, &link->peer_rq_credits); - if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) - wake_up_nr(&link->wr_tx_wait, credits); - } -} - -// to check whether peer rq credits is lower than watermark. -static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) -{ - return link->credits_enable && - atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; -} - -// get local rq credits and set credits to zero. -// may called when announcing credits -static inline int smc_wr_rx_get_credits(struct smc_link *link) -{ - return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; -} - -// called when post_recv a rqe -static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) -{ - if (link->credits_enable && credits) - atomic_add(credits, &link->local_rq_credits); -} - -// to check whether local rq credits is higher than watermark. 
-static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) -{ - return link->credits_enable && - atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; -} - -static inline int smc_wr_rx_credits_need_announce_frequent(struct smc_link *link) -{ - /* announce when local rq credits accumulated more than credits_update_limit, or - * peer rq credits is empty. As peer credits empty and local credits is less than - * credits_update_limit, may results in credits deadlock. - */ - return link->credits_enable && - (atomic_read(&link->local_rq_credits) >= link->credits_update_limit || - !atomic_read(&link->peer_rq_credits)); -} - /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -151,22 +90,14 @@ static inline int smc_wr_rx_post(struct smc_link *link) u64 wr_id, temp_wr_id; u32 index; - link->wr_rx_id += 2; - wr_id = link->wr_rx_id; /* tasklet context, thus not atomic */ - temp_wr_id = wr_id / 2; + wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */ + temp_wr_id = wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); - if (!rc) - smc_wr_rx_put_credits(link, 1); return rc; } -static inline bool smc_wr_id_is_rx(u64 wr_id) -{ - return wr_id % 2; -} - int smc_wr_create_link(struct smc_link *lnk); int smc_wr_alloc_link_mem(struct smc_link *lnk); int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr); @@ -193,11 +124,12 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int len); int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); -void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context); +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct 
smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr); #endif /* SMC_WR_H */ diff --git a/net/socket.c b/net/socket.c index 3917e02b2b2f..d52c265ad449 100644 --- a/net/socket.c +++ b/net/socket.c @@ -141,38 +141,6 @@ static void sock_show_fdinfo(struct seq_file *m, struct file *f) #define sock_show_fdinfo NULL #endif -#if IS_ENABLED(CONFIG_SMC) -static bool try_tcp2smc_convert(struct net *net, int *family, int type, - int *protocol, int kern) -{ - int (*f)(struct net *n, char *c) = NULL; - - /* Only convert userspace socket */ - if (kern) - return false; - - if ((*family == AF_INET || *family == AF_INET6) && - type == SOCK_STREAM && - (*protocol == IPPROTO_IP || *protocol == IPPROTO_TCP)) { - if (net->smc.sysctl_tcp2smc) - goto convert; - - rcu_read_lock(); - f = rcu_dereference(net->smc.smc_conv.smc_conv_match_rcu); - if (f && !f(net, current->comm)) { - rcu_read_unlock(); - goto convert; - } - rcu_read_unlock(); - } - return false; -convert: - *protocol = (*family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; - *family = AF_SMC; - return true; -} -#endif - /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. @@ -1399,9 +1367,6 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family = PF_PACKET; } -#if IS_ENABLED(CONFIG_SMC) - try_tcp2smc_convert(net, &family, type, &protocol, kern); -#endif err = security_socket_create(family, type, protocol, kern); if (err) -- Gitee From 3929c274e16ce7007eb648c097402d1d30adc7e8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Jul 2022 16:09:57 +0200 Subject: [PATCH 02/76] net/smc: Eliminate struct smc_ism_position ANBZ: #3257 commit eb481b02bd182a96e22070895bf887277b82150f upstream. 
This struct is used in a single place only, and its usage generates inefficient code. Time to clean up! Signed-off-by: Heiko Carstens Reviewed-and-tested-by: Stefan Raspl Signed-off-by: Wenjia Zhang < wenjia@linux.ibm.com> Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_ism.c | 11 ----------- net/smc/smc_ism.h | 20 +++++++++++--------- net/smc/smc_tx.c | 10 +++------- 3 files changed, 14 insertions(+), 27 deletions(-) diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index fd28cc498b98..f3c8eb99d681 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -32,17 +32,6 @@ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) vlan_id); } -int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos, - void *data, size_t len) -{ - int rc; - - rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal, - pos->offset, data, len); - - return rc < 0 ? rc : 0; -} - void smc_ism_get_system_eid(u8 **eid) { if (!smc_ism_v2_capable) diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 004b22a13ffa..d6b2db604fe8 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -28,13 +28,6 @@ struct smc_ism_vlanid { /* VLAN id set on ISM device */ refcount_t refcnt; /* Reference count */ }; -struct smc_ism_position { /* ISM device position to write to */ - u64 token; /* Token of DMB */ - u32 offset; /* Offset into DMBE */ - u8 index; /* Index of DMBE */ - u8 signal; /* Generate interrupt on owner side */ -}; - struct smcd_dev; int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); @@ -45,12 +38,21 @@ int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); -int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, - void *data, size_t len); int smc_ism_signal_shutdown(struct 
smc_link_group *lgr); void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); void smc_ism_init(void); int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); + +static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, size_t len) +{ + int rc; + + rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + return rc < 0 ? rc : 0; +} + #endif diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 4e8377657a62..64dedffe9d26 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -320,15 +320,11 @@ int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal) { - struct smc_ism_position pos; int rc; - memset(&pos, 0, sizeof(pos)); - pos.token = conn->peer_token; - pos.index = conn->peer_rmbe_idx; - pos.offset = conn->tx_off + offset; - pos.signal = signal; - rc = smc_ism_write(conn->lgr->smcd, &pos, data, len); + rc = smc_ism_write(conn->lgr->smcd, conn->peer_token, + conn->peer_rmbe_idx, signal, conn->tx_off + offset, + data, len); if (rc) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return rc; -- Gitee From de17f269a0a173ce96c55f547ef7d51175f67f45 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 25 Jul 2022 16:09:58 +0200 Subject: [PATCH 03/76] s390/ism: Cleanups ANBZ: #3257 commit 0a2f4f9893c83bd722bd55a903fb682da2eb24ba upstream. Reworked signature of the function to retrieve the system EID: No plausible reason to use a double pointer. And neither to pass in the device as an argument, as this identifier is by definition per system, not per device. Plus some minor consistency edits. Signed-off-by: Stefan Raspl Signed-off-by: Wenjia Zhang < wenjia@linux.ibm.com> Reviewed-by: Tony Lu Signed-off-by: David S. 
Miller --- drivers/s390/net/ism_drv.c | 11 +++++------ include/net/smc.h | 2 +- net/smc/smc_ism.c | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 26cc943d2034..e08697733074 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -409,20 +409,19 @@ static void ism_create_system_eid(void) memcpy(&SYSTEM_EID.type, tmp, 4); } -static void ism_get_system_eid(struct smcd_dev *smcd, u8 **eid) +static u8 *ism_get_system_eid(void) { - *eid = &SYSTEM_EID.seid_string[0]; + return SYSTEM_EID.seid_string; } static u16 ism_get_chid(struct smcd_dev *smcd) { - struct ism_dev *ismdev; + struct ism_dev *ism = (struct ism_dev *)smcd->priv; - ismdev = (struct ism_dev *)smcd->priv; - if (!ismdev || !ismdev->pdev) + if (!ism || !ism->pdev) return 0; - return to_zpci(ismdev->pdev)->pchid; + return to_zpci(ism->pdev)->pchid; } static void ism_handle_event(struct ism_dev *ism) diff --git a/include/net/smc.h b/include/net/smc.h index e441aa97ad61..2688341d6c8e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -65,7 +65,7 @@ struct smcd_ops { int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); - void (*get_system_eid)(struct smcd_dev *dev, u8 **eid); + u8* (*get_system_eid)(void); u16 (*get_chid)(struct smcd_dev *dev); }; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index f3c8eb99d681..ab30406ffd0d 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -428,7 +428,7 @@ int smcd_register_dev(struct smcd_dev *smcd) if (list_empty(&smcd_dev_list.list)) { u8 *system_eid = NULL; - smcd->ops->get_system_eid(smcd, &system_eid); + system_eid = smcd->ops->get_system_eid(); if (system_eid[24] != '0' || system_eid[28] != '0') { smc_ism_v2_capable = true; memcpy(smc_ism_v2_system_eid, system_eid, -- Gitee From d0a57b8beeb5ddc2b4ef44d6c70a797f931aefb6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 25 
Jul 2022 16:09:59 +0200 Subject: [PATCH 04/76] net/smc: Pass on DMBE bit mask in IRQ handler ANBZ: #3257 commit 8b2fed8e2712e8c23665df3c9e0fbabbb76e466c upstream. Make the DMBE bits, which are passed on individually in ism_move() as parameter idx, available to the receiver. Signed-off-by: Stefan Raspl Signed-off-by: Wenjia Zhang < wenjia@linux.ibm.com> Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- drivers/s390/net/ism_drv.c | 4 +++- include/net/smc.h | 2 +- net/smc/smc_ism.c | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index e08697733074..1adb00ca0a0a 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -443,6 +443,7 @@ static irqreturn_t ism_handle_irq(int irq, void *data) struct ism_dev *ism = data; unsigned long bit, end; unsigned long *bv; + u16 dmbemask; bv = (void *) &ism->sba->dmb_bits[ISM_DMB_WORD_OFFSET]; end = sizeof(ism->sba->dmb_bits) * BITS_PER_BYTE - ISM_DMB_BIT_OFFSET; @@ -456,9 +457,10 @@ static irqreturn_t ism_handle_irq(int irq, void *data) break; clear_bit_inv(bit, bv); + dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET]; ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0; barrier(); - smcd_handle_irq(ism->smcd, bit + ISM_DMB_BIT_OFFSET); + smcd_handle_irq(ism->smcd, bit + ISM_DMB_BIT_OFFSET, dmbemask); } if (ism->sba->e) { diff --git a/include/net/smc.h b/include/net/smc.h index 2688341d6c8e..421a7197b475 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -94,5 +94,5 @@ int smcd_register_dev(struct smcd_dev *smcd); void smcd_unregister_dev(struct smcd_dev *smcd); void smcd_free_dev(struct smcd_dev *smcd); void smcd_handle_event(struct smcd_dev *dev, struct smcd_event *event); -void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit); +void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit, u16 dmbemask); #endif /* _SMC_H */ diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index ab30406ffd0d..98de2bc61483 
100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -507,13 +507,13 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) EXPORT_SYMBOL_GPL(smcd_handle_event); /* SMCD Device interrupt handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer and DMB number. Find the connection and - * schedule the tasklet for this connection. + * Parameters are smcd device pointer, DMB number, and the DMBE bitmask. + * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ -void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) +void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) { struct smc_connection *conn = NULL; unsigned long flags; -- Gitee From 469900a0ad8d58a84abd76cfb73b879bf7542508 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 25 Jul 2022 16:10:00 +0200 Subject: [PATCH 05/76] net/smc: Enable module load on netlink usage ANBZ: #3257 commit 28ec53f3a830750f1b5ccf73cb13dae66ade1660 upstream. Previously, the smc and smc_diag modules were automatically loaded as dependencies of the ism module whenever an ISM device was present. With the pending rework of the ISM API, the smc module will no longer automatically be loaded in presence of an ISM device. Usage of an AF_SMC socket will still trigger loading of the smc modules, but usage of a netlink socket will not. This is addressed by setting the correct module aliases. Signed-off-by: Stefan Raspl Signed-off-by: Wenjia Zhang < wenjia@linux.ibm.com> Reviewed-by: Tony Lu Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 1 + net/smc/smc_diag.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1ce0dc9408b2..14516d3db092 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3514,3 +3514,4 @@ MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); MODULE_ALIAS_TCP_ULP("smc"); +MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 25ef26b621a2..22d38206ed48 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -268,3 +268,4 @@ module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); +MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME); -- Gitee From 9e8e574a10b85917c231cb92d15e4473f0a35725 Mon Sep 17 00:00:00 2001 From: Yacan Liu Date: Tue, 30 Aug 2022 23:23:14 +0800 Subject: [PATCH 06/76] net/smc: Remove redundant refcount increase ANBZ: #3257 commit 9c5d03d362519f36cd551aec596388f895c93d2d upstream. For passive connections, the refcount increment has been done in smc_clcsock_accept()-->smc_sock_alloc(). 
Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc") Signed-off-by: Yacan Liu Reviewed-by: Tony Lu Link: https://lore.kernel.org/r/20220830152314.838736-1-liuyacan@corp.netease.com Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 14516d3db092..3d66270d572d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1854,7 +1854,6 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; - sk_refcnt_debug_inc(newsmcsk); if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; -- Gitee From f11b0fec0d043db21d0ce53b463e2dee357939c4 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 20 Sep 2022 14:43:09 +0800 Subject: [PATCH 07/76] net/smc: Stop the CLC flow if no link to map buffers on ANBZ: #3257 commit e738455b2c6dcdab03e45d97de36476f93f557d2 upstream. There might be a potential race between SMC-R buffer map and link group termination. smc_smcr_terminate_all() | smc_connect_rdma() -------------------------------------------------------------- | smc_conn_create() for links in smcibdev | schedule links down | | smc_buf_create() | \- smcr_buf_map_usable_links() | \- no usable links found, | (rmb->mr = NULL) | | smc_clc_send_confirm() | \- access conn->rmb_desc->mr[]->rkey | (panic) During reboot and IB device module remove, all links will be set down and no usable links remain in link groups. In such situation smcr_buf_map_usable_links() should return an error and stop the CLC flow accessing to uninitialized mr. 
Fixes: b9247544c1bc ("net/smc: convert static link ID instances to support multiple links") Signed-off-by: Wen Gu Link: https://lore.kernel.org/r/1663656189-32090-1-git-send-email-guwen@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/smc/smc_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0b833b73dd6f..f6d2ae6c24ae 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2254,7 +2254,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, static int smcr_buf_map_usable_links(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc, bool is_rmb) { - int i, rc = 0; + int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ mutex_lock(&lgr->llc_conf_mutex); @@ -2267,9 +2267,12 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, rc = -ENOMEM; goto out; } + cnt++; } out: mutex_unlock(&lgr->llc_conf_mutex); + if (!rc && !cnt) + rc = -EINVAL; return rc; } -- Gitee From d6febf11e63ec623312f93bbfa135794a0579b9f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 20 Sep 2022 17:52:21 +0800 Subject: [PATCH 08/76] net/smc: Introduce a specific sysctl for TEST_LINK time ANBZ: #3257 commit 77eee32514314209961af5c2982e871ecb364445 upstream. SMC-R tests the viability of link by sending out TEST_LINK LLC messages over RoCE fabric when connections on link have been idle for a time longer than keepalive interval (testlink time). But using tcp_keepalive_time as testlink time maybe not quite suitable because it is default no less than two hours[1], which is too long for single link to find peer dead. The active host will still use peer-dead link (QP) sending messages, and can't find out until get IB_WC_RETRY_EXC_ERR error CQEs, which takes more time than TEST_LINK timeout (SMC_LLC_WAIT_TIME) normally. So this patch introduces a independent sysctl for SMC-R to set link keepalive time, in order to detect link down in time. 
The default value is 30 seconds. [1] https://www.rfc-editor.org/rfc/rfc1122#page-101 Signed-off-by: Wen Gu Signed-off-by: Paolo Abeni Acked-by: Tony Lu --- Documentation/networking/smc-sysctl.rst | 7 +++++++ include/net/netns/smc.h | 1 + net/smc/smc_llc.c | 2 +- net/smc/smc_llc.h | 1 + net/smc/smc_sysctl.c | 10 +++++++++- 5 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index a93857e580b0..843c8cadb1e3 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -34,3 +34,10 @@ smcr_buf_type - INTEGER - 1 - Use virtually contiguous buffers - 2 - Mixed use of the two types. Try physically contiguous buffers first. If not available, use virtually contiguous buffers then. + +smcr_testlink_time - INTEGER + How frequently SMC-R link sends out TEST_LINK LLC messages to confirm + viability, after the last activity of connections on it. Value 0 means + disabling TEST_LINK. + + Default: 30 seconds. 
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 2adbe2b245df..d295e2c10dca 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -19,5 +19,6 @@ struct netns_smc { #endif unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; + int sysctl_smcr_testlink_time; }; #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 6fea7a2f7e3b..988ae9777375 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -2126,7 +2126,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) init_waitqueue_head(&lgr->llc_flow_waiter); init_waitqueue_head(&lgr->llc_msg_waiter); mutex_init(&lgr->llc_conf_mutex); - lgr->llc_testlink_time = READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); + lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); } /* called after lgr was removed from lgr_list */ diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 4404e52b3346..7e7a3162c68b 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -19,6 +19,7 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_TESTLINK_DEFAULT_TIME (30 * HZ) enum smc_llc_reqresp { SMC_LLC_REQ, diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 39b236f868bd..2354f7062b5f 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -16,6 +16,7 @@ #include "smc.h" #include "smc_core.h" +#include "smc_llc.h" #include "smc_sysctl.h" static int two = 2; @@ -37,6 +38,13 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "smcr_testlink_time", + .data = &init_net.smc.sysctl_smcr_testlink_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { } }; @@ -62,7 +70,7 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; - + net->smc.sysctl_smcr_testlink_time = 
SMC_LLC_TESTLINK_DEFAULT_TIME; return 0; err_reg: -- Gitee From c806f7ff5b2065d5e1d99c7e00d5aec8f34255e6 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 20 Sep 2022 17:52:22 +0800 Subject: [PATCH 09/76] net/smc: Unbind r/w buffer size from clcsock and make them tunable ANBZ: #3257 commit 0227f058aa29f5ab6f6ec79c3a36ae41f1e03a13 upstream. Currently, SMC uses smc->sk.sk_{rcv|snd}buf to create buffers for send buffer and RMB. And the values of buffer size are from tcp_{w|r}mem in clcsock. The buffer size from TCP socket doesn't fit SMC well. Generally, buffers are usually larger than TCP for SMC-R/-D to get higher performance, for they are different underlay devices and paths. So this patch unbinds buffer size from TCP, and introduces two sysctl knobs to tune them independently. Also, these knobs are per net namespace and work for containers. Signed-off-by: Tony Lu Signed-off-by: Paolo Abeni --- Documentation/networking/smc-sysctl.rst | 18 ++++++++++++++++++ include/net/netns/smc.h | 2 ++ net/smc/af_smc.c | 5 ++--- net/smc/smc_core.c | 8 ++++---- net/smc/smc_sysctl.c | 20 ++++++++++++++++++++ 5 files changed, 46 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index 843c8cadb1e3..2c4b5c2181f7 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -41,3 +41,21 @@ smcr_testlink_time - INTEGER disabling TEST_LINK. Default: 30 seconds. + +wmem - INTEGER + Initial size of send buffer used by SMC sockets. + The default value inherits from net.ipv4.tcp_wmem[1]. + + The minimum value is 16KiB and there is no hard limit for max value, but + only allowed 512KiB for SMC-R and 1MiB for SMC-D. + + Default: 16K + +rmem - INTEGER + Initial size of receive buffer (RMB) used by SMC sockets. + The default value inherits from net.ipv4.tcp_rmem[1]. 
+ + The minimum value is 16KiB and there is no hard limit for max value, but + only allowed 512KiB for SMC-R and 1MiB for SMC-D. + + Default: 128K diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index d295e2c10dca..582212ada3ba 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -20,5 +20,7 @@ struct netns_smc { unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; int sysctl_smcr_testlink_time; + int sysctl_wmem; + int sysctl_rmem; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3d66270d572d..1e0ce27fe29c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -379,6 +379,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -3252,9 +3254,6 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); - out: return rc; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f6d2ae6c24ae..5f163c1a4c46 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2325,10 +2325,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf / 2; + sk_buf_size = smc->sk.sk_rcvbuf; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf / 2; + sk_buf_size = smc->sk.sk_sndbuf; for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { 
@@ -2387,7 +2387,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize * 2; + smc->sk.sk_rcvbuf = bufsize; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -2395,7 +2395,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize * 2; + smc->sk.sk_sndbuf = bufsize; atomic_set(&conn->sndbuf_space, bufsize); } return 0; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 2354f7062b5f..6ed67835c687 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -20,6 +20,8 @@ #include "smc_sysctl.h" static int two = 2; +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; static struct ctl_table smc_table[] = { { @@ -45,6 +47,22 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "wmem", + .data = &init_net.smc.sysctl_wmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem", + .data = &init_net.smc.sysctl_rmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, { } }; @@ -71,6 +89,8 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; + WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); return 0; err_reg: -- Gitee From 4bdfc5195615494b57cbeb36973bfc0adc1022f4 Mon Sep 17 00:00:00 2001 From: Tony Lu 
Date: Thu, 22 Sep 2022 20:19:07 +0800 Subject: [PATCH 10/76] net/smc: Support SO_REUSEPORT ANBZ: #3257 commit 6627a2074d5c82b3efd71c978f13f93f7ab9bf46 upstream. This enables SO_REUSEPORT [1] for clcsock when it is set on smc socket, so that some applications which uses it can be transparently replaced with SMC. Also, this helps improve load distribution. Here is a simple test of NGINX + wrk with SMC. The CPU usage is collected on NGINX (server) side as below. Disable SO_REUSEPORT: 05:15:33 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 05:15:34 PM all 7.02 0.00 11.86 0.00 2.04 8.93 0.00 0.00 0.00 70.15 05:15:34 PM 0 0.00 0.00 0.00 0.00 16.00 70.00 0.00 0.00 0.00 14.00 05:15:34 PM 1 11.58 0.00 22.11 0.00 0.00 0.00 0.00 0.00 0.00 66.32 05:15:34 PM 2 1.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 98.00 05:15:34 PM 3 16.84 0.00 30.53 0.00 0.00 0.00 0.00 0.00 0.00 52.63 05:15:34 PM 4 28.72 0.00 44.68 0.00 0.00 0.00 0.00 0.00 0.00 26.60 05:15:34 PM 5 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 05:15:34 PM 6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 05:15:34 PM 7 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 Enable SO_REUSEPORT: 05:15:20 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 05:15:21 PM all 8.56 0.00 14.40 0.00 2.20 9.86 0.00 0.00 0.00 64.98 05:15:21 PM 0 0.00 0.00 4.08 0.00 14.29 76.53 0.00 0.00 0.00 5.10 05:15:21 PM 1 9.09 0.00 16.16 0.00 1.01 0.00 0.00 0.00 0.00 73.74 05:15:21 PM 2 9.38 0.00 16.67 0.00 1.04 0.00 0.00 0.00 0.00 72.92 05:15:21 PM 3 10.42 0.00 17.71 0.00 1.04 0.00 0.00 0.00 0.00 70.83 05:15:21 PM 4 9.57 0.00 15.96 0.00 0.00 0.00 0.00 0.00 0.00 74.47 05:15:21 PM 5 9.18 0.00 15.31 0.00 0.00 1.02 0.00 0.00 0.00 74.49 05:15:21 PM 6 8.60 0.00 15.05 0.00 0.00 0.00 0.00 0.00 0.00 76.34 05:15:21 PM 7 12.37 0.00 14.43 0.00 0.00 0.00 0.00 0.00 0.00 73.20 Using SO_REUSEPORT helps the load distribution of NGINX be more balanced. 
[1] https://man7.org/linux/man-pages/man7/socket.7.html Signed-off-by: Tony Lu Acked-by: Wenjia Zhang Link: https://lore.kernel.org/r/20220922121906.72406-1-tonylu@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1e0ce27fe29c..462508b5fc47 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -429,6 +429,7 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, goto out_rel; smc->clcsock->sk->sk_reuse = sk->sk_reuse; + smc->clcsock->sk->sk_reuseport = sk->sk_reuseport; rc = kernel_bind(smc->clcsock, uaddr, addr_len); out_rel: -- Gitee From d1eaff6acdfd89841f1440913fe57f8a856e8879 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 14 Oct 2022 12:34:36 +0300 Subject: [PATCH 11/76] net/smc: Fix an error code in smc_lgr_create() ANBZ: #3257 commit bdee15e8c58b450ad736a2b62ef8c7a12548b704 upstream. If smc_wr_alloc_lgr_mem() fails then return an error code. Don't return success. Fixes: 8799e310fb3f ("net/smc: add v2 support to the work request layer") Signed-off-by: Dan Carpenter Reviewed-by: Wenjia Zhang Signed-off-by: David S. Miller --- net/smc/smc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5f163c1a4c46..cd2c36ee7e8c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -909,7 +909,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); - if (smc_wr_alloc_lgr_mem(lgr)) + rc = smc_wr_alloc_lgr_mem(lgr); + if (rc) goto free_wq; smc_llc_lgr_init(lgr, smc); -- Gitee From bea486193a11f710f698a130c9bfd2227f03be41 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 1 Nov 2022 17:37:22 +0800 Subject: [PATCH 12/76] net/smc: Fix possible leaked pernet namespace in smc_init() ANBZ: #3257 commit 62ff373da2534534c55debe6c724c7fe14adb97f upstream. 
In smc_init(), register_pernet_subsys(&smc_net_stat_ops) is called without any error handling. If it fails, registering of &smc_net_ops won't be reverted. And if smc_nl_init() fails, &smc_net_stat_ops itself won't be reverted. This leaves wild ops in subsystem linkedlist and when another module tries to call register_pernet_operations() it triggers page fault: BUG: unable to handle page fault for address: fffffbfff81b964c RIP: 0010:register_pernet_operations+0x1b9/0x5f0 Call Trace: register_pernet_subsys+0x29/0x40 ebtables_init+0x58/0x1000 [ebtables] ... Fixes: 194730a9beb5 ("net/smc: Make SMC statistics network namespace aware") Signed-off-by: Chen Zhongjin Reviewed-by: Tony Lu Reviewed-by: Wenjia Zhang Link: https://lore.kernel.org/r/20221101093722.127223-1-chenzhongjin@huawei.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 462508b5fc47..6924c345606f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3379,14 +3379,14 @@ static int __init smc_init(void) rc = register_pernet_subsys(&smc_net_stat_ops); if (rc) - return rc; + goto out_pernet_subsys; smc_ism_init(); smc_clc_init(); rc = smc_nl_init(); if (rc) - goto out_pernet_subsys; + goto out_pernet_subsys_stat; rc = smc_pnet_init(); if (rc) @@ -3479,6 +3479,8 @@ static int __init smc_init(void) smc_pnet_exit(); out_nl: smc_nl_exit(); +out_pernet_subsys_stat: + unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: unregister_pernet_subsys(&smc_net_ops); -- Gitee From c4162c55c9f96f5ba69fed8fffedb3514fa22882 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:15:06 +0800 Subject: [PATCH 13/76] anolis: net/smc: Expose SMCPROTO_SMC and SMCPROTO_SMC6 to userspace ANBZ: #1742 This patch exposes SMCPROTO_SMC and SMCPROTO_SMC6 to userspace by moving them to in.h and in6.h. 
Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/uapi/linux/in.h | 3 +++ include/uapi/linux/in6.h | 2 ++ net/smc/smc.h | 4 ---- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index d1b327036ae4..40b1e51b18c9 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,6 +84,9 @@ enum { }; #endif +/* SMC protocol, IPv4 */ +#define SMCPROTO_SMC 0 + #if __UAPI_DEF_IN_ADDR /* Internet address. */ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 5ad396a57eb3..6c21c85be0e3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,6 +95,8 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 +/* SMC protocol, IPv6 */ +#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/net/smc/smc.h b/net/smc/smc.h index 5ed765ea0c73..0f1a51ae6d15 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -22,10 +22,6 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 - -#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ -#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ - #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ -- Gitee From b5195122e54faad4d1ce3e6cbcf25783a583b5b7 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:16:24 +0800 Subject: [PATCH 14/76] anolis: net/smc: Introduce sysctl tcp2smc ANBZ: #1742 This patch adds sysctl 'tcp2smc' to provide a switch for replacing TCP to SMC-R when new sockets are created in a specific net namespace. 
Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 + net/smc/smc_sysctl.c | 8 ++++++++ net/socket.c | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 582212ada3ba..0f17768da473 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -22,5 +22,6 @@ struct netns_smc { int sysctl_smcr_testlink_time; int sysctl_wmem; int sysctl_rmem; + int sysctl_tcp2smc; }; #endif diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 6ed67835c687..faaa795537c4 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -63,6 +63,13 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, + { + .procname = "tcp2smc", + .data = &init_net.smc.sysctl_tcp2smc, + .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -91,6 +98,7 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); + net->smc.sysctl_tcp2smc = 0; return 0; err_reg: diff --git a/net/socket.c b/net/socket.c index d52c265ad449..96860a0f9330 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1367,6 +1367,14 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family = PF_PACKET; } +#if IS_ENABLED(CONFIG_SMC) + if (!kern && (family == AF_INET || family == AF_INET6) && + type == SOCK_STREAM && (protocol == IPPROTO_IP || + protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { + protocol = (family == AF_INET) ? 
SMCPROTO_SMC : SMCPROTO_SMC6; + family = AF_SMC; + } +#endif err = security_socket_create(family, type, protocol, kern); if (err) -- Gitee From 4e905160740f05c98042124fe2e1fadb7e95216d Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:33:42 +0800 Subject: [PATCH 15/76] anolis: net/smc: Introduce SMC-R-related proc files ANBZ: #1742 This patch introduces SMC-R proc files to report statistics information of SMC-R connections. Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/net_namespace.h | 1 + include/net/smc.h | 5 +- net/smc/Makefile | 2 +- net/smc/af_smc.c | 25 +++- net/smc/smc_diag.c | 29 ++-- net/smc/smc_proc.c | 287 ++++++++++++++++++++++++++++++++++++ net/smc/smc_proc.h | 34 +++++ 7 files changed, 362 insertions(+), 21 deletions(-) create mode 100644 net/smc/smc_proc.c create mode 100644 net/smc/smc_proc.h diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 76e9cce289a4..220878bfe86b 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,6 +95,7 @@ struct net { struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; + struct proc_dir_entry *proc_net_smc; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; diff --git a/include/net/smc.h b/include/net/smc.h index 421a7197b475..ed406e50d50e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -12,10 +12,13 @@ #define _SMC_H #define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ +#define SMC_HTABLE_SHIFT 9 +#define SMC_HTABLE_SIZE (1 << SMC_HTABLE_SHIFT) /* Size of SMC hashtable buckets */ struct smc_hashinfo { + unsigned int bkt_idx; rwlock_t lock; - struct hlist_head ht; + struct hlist_head ht[SMC_HTABLE_SIZE]; }; int smc_hash_sk(struct sock *sk); diff --git a/net/smc/Makefile b/net/smc/Makefile index 875efcd126a2..956810a09da9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o +smc-y += smc_tracepoint.o smc_proc.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6924c345606f..e15fb07e4535 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -52,6 +52,7 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" +#include "smc_proc.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -181,11 +182,13 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - head = &h->ht; - write_lock_bh(&h->lock); + + head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); return 0; @@ -3371,7 +3374,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc; + int rc, i; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -3441,8 +3444,11 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); + + for (i = 0; i < SMC_HTABLE_SIZE; i++) { + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); + } rc 
= smc_ib_register_client(); if (rc) { @@ -3456,9 +3462,17 @@ static int __init smc_init(void) goto out_ib; } + rc = smc_proc_init(); + if (rc) { + pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); + goto out_ulp; + } + static_branch_enable(&tcp_have_smc); return 0; +out_ulp: + tcp_unregister_ulp(&smc_ulp_ops); out_ib: smc_ib_unregister_client(); out_sock: @@ -3491,6 +3505,7 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); + smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 22d38206ed48..6edc739f8e08 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -196,24 +196,25 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; - int rc = 0, num = 0; + int rc = 0, num = 0, slot; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - head = &prot->h.smc_hash->ht; - if (hlist_empty(head)) - goto out; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + + for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { + head = &prot->h.smc_hash->ht[slot]; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; + num++; + } } out: diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c new file mode 100644 index 000000000000..19d8cc82a7ac --- /dev/null +++ b/net/smc/smc_proc.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include "smc.h" +#include "smc_proc.h" +#include "smc_core.h" + +static void *smc_get_next(struct seq_file *seq, void *cur) +{ + struct smc_proc_private *sp = 
seq->private; + struct smc_hashinfo *smc_hash = + sp->protocol == SMCPROTO_SMC ? + smc_proto.h.smc_hash : smc_proto6.h.smc_hash; + struct net *net = seq_file_net(seq); + struct hlist_head *head; + struct sock *sk = cur; + + if (!sk) { + read_lock(&smc_hash->lock); +get_head: + head = &smc_hash->ht[sp->bucket]; + sk = sk_head(head); + sp->offset = 0; + goto get_sk; + } + ++sp->num; + ++sp->offset; + + sk = sk_next(sk); +get_sk: + sk_for_each_from(sk) { + if (!net_eq(sock_net(sk), net)) + continue; + return sk; + } + sp->offset = 0; + if (++sp->bucket < SMC_HTABLE_SIZE) + goto get_head; + + read_unlock(&smc_hash->lock); + return NULL; +} + +static void *smc_seek_last_pos(struct seq_file *seq) +{ + struct smc_proc_private *sp = seq->private; + int offset = sp->offset; + int orig_num = sp->num; + void *rc = NULL; + + if (sp->bucket >= SMC_HTABLE_SIZE) + goto out; + + rc = smc_get_next(seq, NULL); + while (offset-- && rc) + rc = smc_get_next(seq, rc); + + if (rc) + goto out; + + sp->bucket = 0; +out: + sp->num = orig_num; + return rc; +} + +static void *smc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct smc_proc_private *sp = seq->private; + void *rc; + + sp->bucket = 0; + rc = smc_get_next(seq, NULL); + + while (rc && pos) { + rc = smc_get_next(seq, rc); + --pos; + } + return rc; +} + +static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) +{ + struct smc_proc_private *sp = seq->private; + void *rc; + + if (*pos && *pos == sp->last_pos) { + rc = smc_seek_last_pos(seq); + if (rc) + goto out; + } + + sp->num = 0; + sp->bucket = 0; + sp->offset = 0; + sp->protocol = protocol; + rc = *pos ? 
smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + +out: + sp->last_pos = *pos; + return rc; +} + +static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) +{ + return _smc_conn_start(seq, pos, SMCPROTO_SMC); +} + +static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) +{ + return _smc_conn_start(seq, pos, SMCPROTO_SMC6); +} + +static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) +{ + struct smc_proc_private *sp = seq->private; + const struct in6_addr *dest, *src; + struct smc_link_group *lgr; + struct socket *clcsock; + struct smc_link *lnk; + struct sock *sk; + bool fb = false; + int i; + + fb = smc->use_fallback; + clcsock = smc->clcsock; + sk = &smc->sk; + + if (protocol == SMCPROTO_SMC) + seq_printf(seq, CONN4_ADDR_FM, sp->num, + clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, + clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); + else if (protocol == SMCPROTO_SMC6) { + dest = &clcsock->sk->sk_v6_daddr; + src = &clcsock->sk->sk_v6_rcv_saddr; + seq_printf(seq, CONN6_ADDR_FM, sp->num, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); + } + + seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, + sk, clcsock->sk, fb ? clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); + + lgr = smc->conn.lgr; + lnk = smc->conn.lnk; + + if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { + for (i = 0; i < SMC_LGR_ID_SIZE; i++) + seq_printf(seq, "%02X", lgr->id[i]); + + seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', + lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, + lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); + } else { + seq_puts(seq, "- - - - - - - -\n"); + } +} + +static int smc_conn_show(struct seq_file *seq, void *v) +{ + struct smc_proc_private *sp = seq->private; + struct socket *clcsock; + struct smc_sock *smc; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, + "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", + "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", + "l_qp", "r_qp", "tx_cnt", "rx_cnt"); + goto out; + } + + smc = smc_sk(v); + clcsock = smc->clcsock; + if (!clcsock) + goto out; + + _conn_show(seq, smc, sp->protocol); +out: + return 0; +} + +static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct smc_proc_private *sp = seq->private; + void *rc = NULL; + + if (v == SEQ_START_TOKEN) { + rc = smc_get_idx(seq, 0); + goto out; + } + rc = smc_get_next(seq, v); +out: + ++*pos; + sp->last_pos = *pos; + return rc; +} + +static void smc_conn_stop(struct seq_file *seq, void *v) +{ + struct smc_proc_private *sp = seq->private; + struct smc_hashinfo *smc_hash = + sp->protocol == SMCPROTO_SMC ? 
+ smc_proto.h.smc_hash : smc_proto6.h.smc_hash; + + if (v && v != SEQ_START_TOKEN) + read_unlock(&smc_hash->lock); +} + +static struct smc_proc_entry smc_proc[] = { + { + .name = "smc4", + .ops = { + .show = smc_conn_show, + .start = smc_conn4_start, + .next = smc_conn_next, + .stop = smc_conn_stop, + }, + }, +#if IS_ENABLED(CONFIG_IPV6) + { + .name = "smc6", + .ops = { + .show = smc_conn_show, + .start = smc_conn6_start, + .next = smc_conn_next, + .stop = smc_conn_stop, + }, + }, +#endif +}; + +static int __net_init smc_proc_dir_init(struct net *net) +{ + int i, rc = -ENOMEM; + + net->proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); + if (!net->proc_net_smc) + goto err; + + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { + if (!proc_create_net_data(smc_proc[i].name, 0444, + net->proc_net_smc, &smc_proc[i].ops, + sizeof(struct smc_proc_private), + NULL)) + goto err_entry; + } + + return 0; + +err_entry: + for (i -= 1; i >= 0; i--) + remove_proc_entry(smc_proc[i].name, net->proc_net_smc); + + remove_proc_entry("smc", net->proc_net); +err: + return rc; +} + +static void __net_exit smc_proc_dir_exit(struct net *net) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) + remove_proc_entry(smc_proc[i].name, net->proc_net_smc); + + remove_proc_entry("smc", net->proc_net); +} + +static struct pernet_operations smc_proc_ops = { + .init = smc_proc_dir_init, + .exit = smc_proc_dir_exit, +}; + +int __init smc_proc_init(void) +{ + return register_pernet_subsys(&smc_proc_ops); +} + +void smc_proc_exit(void) +{ + unregister_pernet_subsys(&smc_proc_ops); +} diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h new file mode 100644 index 000000000000..ec59ca03e163 --- /dev/null +++ b/net/smc/smc_proc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _SMC_PROC_H_ +#define _SMC_PROC_H_ + +#include +#include +#include +#include +#include +#include "smc.h" + +#define CONN4_HDR 
("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") +#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") +#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") + +struct smc_proc_private { + struct seq_net_private p; + int num, bucket, offset; + int protocol; + loff_t last_pos; +}; + +struct smc_proc_entry { + const char *name; + const struct seq_operations ops; +}; + +int __init smc_proc_init(void); +void smc_proc_exit(void); + +#endif -- Gitee From 2bab4d8ded9b17e47a7cb9c566e401fe96da0b66 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:36:07 +0800 Subject: [PATCH 16/76] anolis: net/smc: Introduce TCP to SMC replacement netlink commands ANBZ: #1742 This patch introduces new SMC-R generic netlink commands SMC_NETLINK_{ ADD | DEL | GET }_TCP2SMC_WLIST to add | delete | get application-oriented TCP-to-SMC replacement white list. 
Comparison between Average time cost of creating or destroying 2000 TCP connections in different situations: 1) Without this patch and remove the patch which introduces TCP2SMC sysctl: Average creation time cost: 1106 us; Average destruction time cost: 6 us; 2) With this patch but not load SMC module: Average creation time cost: 1161 us; Average destruction time cost: 6 us; 3) With this patch and load SMC module: Average creation time cost: 1157 us; Average destruction time cost: 6 us; 4) With this patch, load SMC module and add 2 elements in TCP2SMC conversion white list: Average creation time cost: 1177 us; Average destruction time cost: 6 us; Signed-off-by: Wen Gu Reviewed-by: Xuan Zhuo Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 9 +- include/uapi/linux/smc.h | 3 + net/smc/Makefile | 2 +- net/smc/af_smc.c | 10 +++ net/smc/smc_conv.c | 186 +++++++++++++++++++++++++++++++++++++++ net/smc/smc_conv.h | 22 +++++ net/smc/smc_netlink.c | 19 +++- net/smc/smc_netlink.h | 5 ++ net/socket.c | 39 ++++++-- 9 files changed, 286 insertions(+), 9 deletions(-) create mode 100644 net/smc/smc_conv.c create mode 100644 net/smc/smc_conv.h diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 0f17768da473..23fb129b75c8 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -6,14 +6,21 @@ struct smc_stats_rsn; struct smc_stats; +struct smc_convert { + int wlist_len; + struct mutex wlist_lock; + struct list_head wlist; + int (*smc_conv_match_rcu)(struct net *net, char *comm); +}; + struct netns_smc { /* per cpu counters for SMC */ struct smc_stats __percpu *smc_stats; /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - bool limit_smc_hs; /* constraint on handshake */ + struct smc_convert smc_conv; #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 
bb4dacca31e7..4ec01eb8215e 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -62,6 +62,9 @@ enum { SMC_NETLINK_DUMP_HS_LIMITATION, SMC_NETLINK_ENABLE_HS_LIMITATION, SMC_NETLINK_DISABLE_HS_LIMITATION, + SMC_NETLINK_ADD_TCP2SMC_WLIST, + SMC_NETLINK_DEL_TCP2SMC_WLIST, + SMC_NETLINK_GET_TCP2SMC_WLIST, }; /* SMC_GENL_FAMILY top level attributes */ diff --git a/net/smc/Makefile b/net/smc/Makefile index 956810a09da9..bd6f807ff803 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_proc.o +smc-y += smc_tracepoint.o smc_proc.o smc_conv.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e15fb07e4535..cc1d9a2957f0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,6 +53,7 @@ #include "smc_tracepoint.h" #include "smc_sysctl.h" #include "smc_proc.h" +#include "smc_conv.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -3468,9 +3469,17 @@ static int __init smc_init(void) goto out_ulp; } + rc = smc_conv_init(); + if (rc) { + pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); + goto out_proc; + } + static_branch_enable(&tcp_have_smc); return 0; +out_proc: + smc_proc_exit(); out_ulp: tcp_unregister_ulp(&smc_ulp_ops); out_ib: @@ -3505,6 +3514,7 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); + smc_conv_exit(); smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_conv.c b/net/smc/smc_conv.c new file mode 100644 index 000000000000..e1f87d1de8a5 --- /dev/null +++ b/net/smc/smc_conv.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include 
+#include +#include +#include "smc_netlink.h" +#include "smc_conv.h" + +int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *wlist_elem, *tmp; + char msg[TASK_COMM_LEN]; + struct nlattr *na; + + na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; + if (!na) + return -EINVAL; + + nla_strlcpy(msg, na, TASK_COMM_LEN); + + mutex_lock(wlist_lock); + if (*wlist_len >= SMC_MAX_WLIST_LEN) { + mutex_unlock(wlist_lock); + return -EINVAL; + } + + list_for_each_entry(tmp, wlist, list) { + if (!strcmp(tmp->task_comm, msg)) + goto out; + } + + wlist_elem = kmalloc(sizeof(*wlist_elem), GFP_KERNEL); + if (!wlist_elem) { + mutex_unlock(wlist_lock); + return -ENOMEM; + } + + strcpy(wlist_elem->task_comm, msg); + list_add_tail_rcu(&wlist_elem->list, wlist); + ++*wlist_len; +out: + mutex_unlock(wlist_lock); + return 0; +} + +int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *tmp, *nxt; + char msg[TASK_COMM_LEN]; + struct nlattr *na; + + na = info->attrs[SMC_CMD_ATTR_TCP2SMC]; + if (!na) + return -EINVAL; + + nla_strlcpy(msg, na, TASK_COMM_LEN); + + mutex_lock(wlist_lock); + list_for_each_entry_safe(tmp, nxt, wlist, list) { + if (!strcmp(tmp->task_comm, msg)) { + list_del_rcu(&tmp->list); + synchronize_rcu(); + kfree(tmp); + --*wlist_len; + break; + } + } + mutex_unlock(wlist_lock); + return 0; +} + +int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct list_head *wlist = &net->smc.smc_conv.wlist; + struct 
smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_conv_wlist_elem *tmp; + void *nlh; + + if (cb_ctx->pos[0]) + goto errmsg; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_TCP2SMC_WLIST); + if (!nlh) + goto errmsg; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, wlist, list) { + if (nla_put(skb, SMC_CMD_ATTR_TCP2SMC, + nla_total_size(strlen(tmp->task_comm) + 1), + tmp->task_comm)) { + rcu_read_unlock(); + goto errattr; + } + } + rcu_read_unlock(); + + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + return skb->len; + +errattr: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_match_tcp2smc_wlist(struct net *net, char *comm) +{ + struct list_head *wlist = &net->smc.smc_conv.wlist; + struct smc_conv_wlist_elem *tmp; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, wlist, list) { + if (!strcmp(tmp->task_comm, comm)) { + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + return -1; +} + +static int __net_init smc_net_conv_init(struct net *net) +{ + INIT_LIST_HEAD_RCU(&net->smc.smc_conv.wlist); + net->smc.smc_conv.wlist_len = 0; + + mutex_init(&net->smc.smc_conv.wlist_lock); + + rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, + smc_match_tcp2smc_wlist); + return 0; +} + +static void __net_exit smc_net_conv_exit(struct net *net) +{ + struct mutex *wlist_lock = &net->smc.smc_conv.wlist_lock; + struct list_head *wlist = &net->smc.smc_conv.wlist; + int *wlist_len = &net->smc.smc_conv.wlist_len; + struct smc_conv_wlist_elem *cur, *nxt; + struct list_head tmp_list; + + rcu_assign_pointer(net->smc.smc_conv.smc_conv_match_rcu, NULL); + synchronize_rcu(); + + INIT_LIST_HEAD(&tmp_list); + + mutex_lock(wlist_lock); + list_splice_init_rcu(wlist, &tmp_list, synchronize_rcu); + *wlist_len = 0; + mutex_unlock(wlist_lock); + + list_for_each_entry_safe(cur, nxt, &tmp_list, list) { + list_del(&cur->list); + kfree(cur); + } +} + +static struct 
pernet_operations smc_conv_ops = { + .init = smc_net_conv_init, + .exit = smc_net_conv_exit, +}; + +int __init smc_conv_init(void) +{ + return register_pernet_subsys(&smc_conv_ops); +} + +void smc_conv_exit(void) +{ + unregister_pernet_subsys(&smc_conv_ops); +} diff --git a/net/smc/smc_conv.h b/net/smc/smc_conv.h new file mode 100644 index 000000000000..1615b27feede --- /dev/null +++ b/net/smc/smc_conv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef NET_SMC_SMC_CONV_H_ +#define NET_SMC_SMC_CONV_H_ +#include +#include +#include + +#define SMC_MAX_WLIST_LEN 32 + +struct smc_conv_wlist_elem { + char task_comm[TASK_COMM_LEN]; + struct list_head list; +}; + +int smc_nl_add_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); +int smc_nl_del_tcp2smc_wlist(struct sk_buff *skb, struct genl_info *info); +int smc_nl_get_tcp2smc_wlist(struct sk_buff *skb, struct netlink_callback *cb); +int __init smc_conv_init(void); +void smc_conv_exit(void); + +#endif /* NET_SMC_SMC_CONV_H_ */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index c5a62f6f52ba..52dba083b70e 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -22,6 +22,7 @@ #include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" +#include "smc_conv.h" const struct nla_policy smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { @@ -126,9 +127,25 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_hs_limitation, }, + { + .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .doit = smc_nl_add_tcp2smc_wlist, + }, + { + .cmd = SMC_NETLINK_DEL_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .doit = smc_nl_del_tcp2smc_wlist, + }, + { + .cmd = SMC_NETLINK_GET_TCP2SMC_WLIST, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_tcp2smc_wlist, + }, }; -static const struct nla_policy smc_gen_nl_policy[2] = { +static const struct nla_policy 
smc_gen_nl_policy[SMC_CMD_MAX_ATTR + 1] = { + [SMC_CMD_ATTR_TCP2SMC] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN - 1 }, [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, }; diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index e8c6c3f0e98c..aae13737095e 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -15,6 +15,11 @@ #include #include +enum { + SMC_CMD_ATTR_TCP2SMC = 1, + SMC_CMD_MAX_ATTR, +}; + extern struct genl_family smc_gen_nl_family; extern const struct nla_policy smc_gen_ueid_policy[]; diff --git a/net/socket.c b/net/socket.c index 96860a0f9330..3917e02b2b2f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -141,6 +141,38 @@ static void sock_show_fdinfo(struct seq_file *m, struct file *f) #define sock_show_fdinfo NULL #endif +#if IS_ENABLED(CONFIG_SMC) +static bool try_tcp2smc_convert(struct net *net, int *family, int type, + int *protocol, int kern) +{ + int (*f)(struct net *n, char *c) = NULL; + + /* Only convert userspace socket */ + if (kern) + return false; + + if ((*family == AF_INET || *family == AF_INET6) && + type == SOCK_STREAM && + (*protocol == IPPROTO_IP || *protocol == IPPROTO_TCP)) { + if (net->smc.sysctl_tcp2smc) + goto convert; + + rcu_read_lock(); + f = rcu_dereference(net->smc.smc_conv.smc_conv_match_rcu); + if (f && !f(net, current->comm)) { + rcu_read_unlock(); + goto convert; + } + rcu_read_unlock(); + } + return false; +convert: + *protocol = (*family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; + *family = AF_SMC; + return true; +} +#endif + /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. 
@@ -1368,12 +1400,7 @@ int __sock_create(struct net *net, int family, int type, int protocol, family = PF_PACKET; } #if IS_ENABLED(CONFIG_SMC) - if (!kern && (family == AF_INET || family == AF_INET6) && - type == SOCK_STREAM && (protocol == IPPROTO_IP || - protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { - protocol = (family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6; - family = AF_SMC; - } + try_tcp2smc_convert(net, &family, type, &protocol, kern); #endif err = security_socket_create(family, type, protocol, kern); -- Gitee From e9f9090e9e59d51ea46d250036cc0d94d8c9462b Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 8 Dec 2021 20:41:05 +0800 Subject: [PATCH 17/76] anolis: net/smc: Add TX and RX diagnosis information ANBZ: #1742 This patch adds RX / TX execution and data size counters for each SMC connection which will be reported in diagnosis information. Signed-off-by: Wen Gu Reviewed-by: Tony Lu Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/uapi/linux/smc_diag.h | 6 ++++++ net/smc/smc.h | 6 ++++++ net/smc/smc_core.c | 15 +++++++++++++++ net/smc/smc_diag.c | 6 ++++++ net/smc/smc_rx.c | 2 ++ net/smc/smc_tx.c | 8 +++++++- 6 files changed, 42 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 8cb3a6fef553..182efdd3ec91 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -79,6 +79,12 @@ struct smc_diag_conninfo { struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ struct smc_diag_cursor tx_sent; /* sent cursor */ struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ + __u64 rx_cnt; /* rx counter */ + __u64 tx_cnt; /* tx counter */ + __u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ + __u64 rx_bytes; /* rx size */ + __u64 tx_bytes; /* tx size */ + __u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ }; /* SMC_DIAG_LINKINFO */ diff --git a/net/smc/smc.h b/net/smc/smc.h 
index 0f1a51ae6d15..9d73fc5fdbc2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -228,6 +228,12 @@ struct smc_connection { u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ + u64 rx_cnt; /* rx counter */ + u64 tx_cnt; /* tx counter */ + u64 tx_corked_cnt; /* tx counter with MSG_MORE flag or corked */ + u64 rx_bytes; /* rx size */ + u64 tx_bytes; /* tx size */ + u64 tx_corked_bytes;/* tx size with MSG_MORE flag or corked */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 freed : 1; /* normal termiation */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index cd2c36ee7e8c..25dd7bd01162 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1862,6 +1862,20 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } +static void smc_rx_tx_counter_init(struct smc_connection *conn) +{ + /* Initialize RX & TX diagnostic information for each + * connection. These counters mean what smc wants + * net devices "TODO" instead of what has been "DONE" + */ + conn->rx_cnt = 0; + conn->tx_cnt = 0; + conn->tx_corked_cnt = 0; + conn->rx_bytes = 0; + conn->tx_bytes = 0; + conn->tx_corked_bytes = 0; +} + /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -1946,6 +1960,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); + smc_rx_tx_counter_init(conn); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 6edc739f8e08..fea662f95bd8 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,6 +136,12 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .tx_sent.count = conn->tx_curs_sent.count,
.tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, + .rx_cnt = conn->rx_cnt, + .tx_cnt = conn->tx_cnt, + .tx_corked_cnt = conn->tx_corked_cnt, + .rx_bytes = conn->rx_bytes, + .tx_bytes = conn->tx_bytes, + .tx_corked_bytes = conn->tx_corked_bytes, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 17c5aee7ee4f..4b548e118268 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -450,6 +450,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable--; /* always stop at urgent Byte */ /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); + conn->rx_bytes += copylen; /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - @@ -497,6 +498,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } trace_smc_rx_recvmsg(smc, copylen); + ++conn->rx_cnt; } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 64dedffe9d26..e1831cfc0ae5 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -282,8 +282,14 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) /* If we need to cork, do nothing and wait for the next * sendmsg() call or push on tx completion */ - if (!smc_tx_should_cork(smc, msg)) + if (!smc_tx_should_cork(smc, msg)) { + conn->tx_bytes += copylen; + ++conn->tx_cnt; smc_tx_sndbuf_nonempty(conn); + } else { + conn->tx_corked_bytes += copylen; + ++conn->tx_corked_cnt; + } trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ -- Gitee From 2c02c53dd0112b93bcf29fcb3373656334cf8b2a Mon Sep 17 00:00:00 2001 From: Dust Li Date: Thu, 2 Sep 2021 13:19:26 +0800 Subject: [PATCH 18/76] anolis: net/smc: don't call ib_req_notify_cq in the send routine ANBZ: #1742 We can just call ib_req_notify_cq() when the link got 
ready, and rearm it after poll_cq(). Which is enough to make sure we won't miss any events. Simple sockperf test shows about 20% gain in throughput test with small messages. Test command: client: smc_run sockperf tp -i $SERVER -m 14 -t 30 --tcp server: smc_run sockperf sr --tcp Without this: Summary: BandWidth is 6.504 MBps (52.034 Mbps) With this: Summary: BandWidth is 7.846 MBps (62.771 Mbps) Signed-off-by: Dust Li Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 6 ++++++ net/smc/smc_wr.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 1cb600767e88..ef4fea545d0f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -135,6 +135,12 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; + + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 26f8f240d9e8..261d8b44d275 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -306,8 +306,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From 1eb0dcc9ac5571d5293ef1d41d7c9b7c883b1bb6 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 22 Sep 2021 11:17:18 +0800 Subject: [PATCH 19/76] anolis: net/smc: allow different subnet communication ANBZ: #1742 SMC checks prefix to ensure that peers are in the same subnet. But there is no need to check this for iWARP over ERDMA, for ERDMA can communicate with each other beyond the subnet. So we provide a sysctl knob allow_different_subnet to support it.
Signed-off-by: Tony Lu Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 11 +++++++---- net/smc/smc_sysctl.c | 10 ++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 23fb129b75c8..2237977565ec 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -30,5 +30,6 @@ struct netns_smc { int sysctl_wmem; int sysctl_rmem; int sysctl_tcp2smc; + int sysctl_allow_different_subnet; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cc1d9a2957f0..c7f11c2264bb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2247,6 +2247,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { + struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -2254,10 +2255,12 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); + if (!net->smc.sysctl_allow_different_subnet) { + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); + } /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index faaa795537c4..e0687a500e2f 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -70,6 +70,15 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "allow_different_subnet", + .data = &init_net.smc.sysctl_allow_different_subnet, + .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), + .mode = 0644, + .proc_handler = 
proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -99,6 +108,7 @@ int __net_init smc_sysctl_net_init(struct net *net) WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); net->smc.sysctl_tcp2smc = 0; + net->smc.sysctl_allow_different_subnet = 1; return 0; err_reg: -- Gitee From 3195bd74bc3d6e922334381312d2151e2a0901fb Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 16 Dec 2021 17:38:05 +0800 Subject: [PATCH 20/76] anolis: net/smc: Avoid unmapping bufs from unused links ANBZ: #1742 ANBZ: #264 smcr_buf_free() intends to unmap each link of link group from a specific buf_desc according to lnk->link_idx. However, if the link has already been cleared before, its lnk->link_idx is 0 and smcr_buf_unmap_link() will repeatedly try to unmap lnk[0] from a buf_desc. The wrong lnk->link_idx won't cause any problems currently because unused links have unmapped bufs from themselves in smcr_link_clear(). But the wrong lnk->link_idx doesn't match the semantic, so it is better to avoid unmapping an unused link. Signed-off-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 25dd7bd01162..10a7b330e626 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1308,8 +1308,11 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, { int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + continue; smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); + } if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); -- Gitee From 570654c01086a8d62ba0521b232fb7af37c2b2c5 Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Fri, 11 Feb 2022 18:12:25 +0800 Subject: [PATCH 21/76] anolis: net/smc: Add sysctl control for handshake limitation ANBZ: #1742 ANBZ: #264 see commit: net/smc: Add global configure for handshake limitation by netlink This patch just adds the sysctl control for anolis. Signed-off-by: D. Wythe Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 2 +- net/smc/smc_sysctl.c | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 2237977565ec..5bcce341b011 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -19,7 +19,7 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - bool limit_smc_hs; /* constraint on handshake */ + int limit_smc_hs; /* constraint on handshake */ struct smc_convert smc_conv; #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index e0687a500e2f..efadc6e5bacb 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -79,6 +79,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "limit_handshake", + .data = &init_net.smc.limit_smc_hs, + .maxlen = sizeof(init_net.smc.limit_smc_hs), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; -- Gitee From 70f41b9eaedceabcf2e6bf15c5e3306090f0049c Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 12 Jan 2022 00:15:51 +0800 Subject: [PATCH 22/76] anolis: net/smc: Support rq flow control in smc-r link layer ANBZ: #1742 ANBZ: #254 This patch supports rq flow control in smc-r link layer. 
QPs communicating without rq flow control, in the previous version, may result in RNR (receive not ready) error, which means when sq sends a message to the remote qp, but the remote qp's rq has no rq entities to receive the message. In RNR situation, the rdma transport layer may retransmit the messages again and again until the rq has any entities, which may lower the performance, especially in heavy traffic. Using credits to do rq flow control can avoid the occurrence of RNR. The test of redis-benchmark shows more than 3X rps improvement in SET and more than 7X rps improvement in GET. Test command: redis-server --save "" --appendonly no --protected-mode no --io-threads 7 --io-threads-do-reads yes redis-benchmark -h 192.168.26.36 -q -t set,get -P 1 --threads 7 -n 2000000 -c 500 -d 10 Before: SET: 173325.25 requests per second, p50=2.703 msec GET: 81383.52 requests per second, p50=5.575 msec After: SET: 554323.69 requests per second, p50=0.959 msec GET: 604741.19 requests per second, p50=0.855 msec Signed-off-by: Guangguan Wang Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 12 ++++++ net/smc/smc_cdc.c | 12 +++++- net/smc/smc_cdc.h | 3 +- net/smc/smc_clc.c | 3 ++ net/smc/smc_clc.h | 3 +- net/smc/smc_core.h | 17 ++++++++- net/smc/smc_ib.c | 6 ++- net/smc/smc_llc.c | 92 +++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 5 +++ net/smc/smc_wr.c | 31 +++++++++++++--- net/smc/smc_wr.h | 54 ++++++++++++++++++++++++++- 11 files changed, 223 insertions(+), 15 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c7f11c2264bb..732bac33cdd4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -703,6 +703,13 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; + link->credits_enable = clc->r0.init_credits ? 
1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, clc->r0.init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. + link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); + } } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, @@ -1258,6 +1265,11 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { + reason_code = SMC_CLC_DECL_CREDITSERR; + goto connect_abort; + } + /* reg sendbufs if they were vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 53f63bfbaf5f..410134dccbf9 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -111,25 +111,30 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; + struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; + u8 saved_credits = 0; int rc; smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + smc_host_msg_to_cdc(cdc_msg, conn, &cfed); + saved_credits = (u8)smc_wr_rx_get_credits(link); + cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (!rc) { + if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_wr_rx_put_credits(link, saved_credits); atomic_dec(&conn->cdc_pend_tx_wr); } @@ -445,6 +450,9 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (cdc->len != SMC_WR_TX_SIZE) 
return; /* invalid message */ + if (cdc->credits) + smc_wr_tx_put_credits(link, cdc->credits, true); + /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 696cc11f2303..145ce7997e64 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,7 +47,8 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 reserved[18]; + u8 credits; /* credits synced by every cdc msg */ + u8 reserved[17]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 1472f31480d8..ba20049ef6ce 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1040,9 +1040,12 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; + clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + clc->r0.init_credits = + link->credits_enable ? 
(u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_short; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fee545c9a10..7b068f7e0519 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -63,6 +63,7 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ +#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -190,7 +191,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 reserved; + u8 init_credits; /* QP rq init credits for rq flowctrl */ __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f9b7dd15479d..7f53309ad796 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -21,7 +21,12 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_RMBS_PER_LGR_MAX 32 /* max. # of RMBs per link group. Correspondingly, + * SMC_WR_BUF_CNT should not be less than 2 * + * SMC_RMBS_PER_LGR_MAX, since every connection at + * least has two rq/sq credits in average, otherwise + * may result in waiting for credits in sending process. 
+ */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -80,6 +85,8 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 +#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 + struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -124,6 +131,14 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ + atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ + atomic_t local_rq_credits; /* credits for local rq flowctrl */ + u8 credits_enable; /* credits enable flag, set when negotiation */ + u8 local_cr_watermark_high; /* local rq credits watermark */ + u8 peer_cr_watermark_low; /* peer rq credits watermark */ + struct work_struct credits_announce_work; /* work for credits announcement */ + unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index ef4fea545d0f..5a183b754851 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -670,10 +670,12 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND + * there are max. 2 RDMA_WRITE per 1 WR_SEND. + * RDMA_WRITE consumes send queue entities, + * without recv queue entities. 
*/ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_recv_wr = SMC_WR_BUF_CNT, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, .max_inline_data = 0, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 988ae9777375..34b1028d8d70 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -75,7 +75,8 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 reserved[8]; + u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved[7]; }; struct smc_llc_msg_add_link_cont_rt { @@ -170,6 +171,12 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct smc_llc_msg_announce_credits { /* type 0x0A */ + struct smc_llc_hdr hd; + u8 credits; + u8 reserved[39]; +}; + struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ struct smc_llc_hdr hd; u8 num_rkeys; @@ -189,6 +196,7 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; + struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -752,6 +760,46 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } +/* send credits announce request or response */ +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force) +{ + struct smc_llc_msg_announce_credits *announce_credits; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + u8 saved_credits = 0; + + if (!link->credits_enable || + (!force && !smc_wr_rx_credits_need_announce(link))) + return 0; + + saved_credits = (u8)smc_wr_rx_get_credits(link); + if (!saved_credits) + /* maybe synced by cdc msg */ + return 0; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) { + smc_wr_rx_put_credits(link, saved_credits); + return rc; + } + + announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; + memset(announce_credits, 0, 
sizeof(*announce_credits)); + announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; + announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); + if (reqresp == SMC_LLC_RESP) + announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; + announce_credits->credits = saved_credits; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + if (rc) + smc_wr_rx_put_credits(link, saved_credits); + + return rc; +} + /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -1015,6 +1063,13 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; + link->credits_enable = add_llc->init_credits ? 1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, add_llc->init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. 
+ link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); + } } /* as an SMC client, process an add link request */ @@ -1935,6 +1990,10 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); + break; case SMC_LLC_REQ_ADD_LINK: /* handle response here, smc_llc_flow_stop() cannot be called * in tasklet context @@ -2020,6 +2079,10 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); + break; default: smc_llc_protocol_violation(link->lgr, qentry->msg.raw.hdr.common.type); @@ -2113,6 +2176,27 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +static void smc_llc_announce_credits_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, + struct smc_link, credits_announce_work); + int rc, retry = 0, agains = 0; + +again: + do { + rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); + } while ((rc == -EBUSY) && smc_link_sendable(link) && + (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); + + if (smc_wr_rx_credits_need_announce(link) && + smc_link_sendable(link) && agains <= 5 && !rc) { + agains++; + goto again; + } + + clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); +} + void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -2148,6 +2232,7 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; 
} @@ -2179,6 +2264,7 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); + cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -2293,6 +2379,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ANNOUNCE_CREDITS + }, /* V2 types */ { .handler = smc_llc_rx_handler, diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 7e7a3162c68b..d0c941e20bee 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -21,6 +21,8 @@ #define SMC_LLC_WAIT_TIME (2 * HZ) #define SMC_LLC_TESTLINK_DEFAULT_TIME (30 * HZ) +#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY (1) + enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -36,6 +38,7 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + SMC_LLC_ANNOUNCE_CREDITS = 0X0A, /* V2 types */ SMC_LLC_CONFIRM_LINK_V2 = 0x21, SMC_LLC_ADD_LINK_V2 = 0x22, @@ -87,6 +90,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 261d8b44d275..55c1deb6bc7f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -130,7 +130,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - wake_up(&link->wr_tx_wait); + 
if (wq_has_sleeper(&link->wr_tx_wait)) + wake_up(&link->wr_tx_wait); } static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) @@ -173,11 +174,16 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; + + if (!smc_wr_tx_get_credit(link)) + return -EBUSY; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; + smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -283,7 +289,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - wake_up(&link->wr_tx_wait); + smc_wr_tx_put_credits(link, 1, true); return 1; } else if (link->lgr->smc_version == SMC_V2 && pend->idx == link->wr_tx_cnt) { @@ -469,6 +475,12 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) break; } } + + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); + } } } @@ -511,6 +523,8 @@ int smc_wr_rx_post_init(struct smc_link *link) for (i = 0; i < link->wr_rx_cnt; i++) rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); return rc; } @@ -545,7 +559,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_recv_wr); } @@ -737,7 +751,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + 
link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -745,7 +759,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -764,7 +778,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) @@ -887,6 +901,11 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + atomic_set(&lnk->peer_rq_credits, 0); + atomic_set(&lnk->local_rq_credits, 0); + lnk->flags = 0; + lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); + lnk->peer_cr_watermark_low = 0; return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index a54e90a1110f..8cf276215c91 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,7 +19,12 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ +#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT + * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, + * since every connection at least has two rq/sq + * credits in average, otherwise may result in + * waiting for credits in sending process. 
+ */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -83,6 +88,51 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } +// get one tx credit, and peer rq credits dec +static inline int smc_wr_tx_get_credit(struct smc_link *link) +{ + return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; +} + +// put tx credits, when some failures occurred after tx credits got +// or receive announce credits msgs +static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) +{ + if (link->credits_enable && credits) { + atomic_add(credits, &link->peer_rq_credits); + if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) + wake_up_nr(&link->wr_tx_wait, credits); + } +} + +// to check whether peer rq credits is lower than watermark. +static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; +} + +// get local rq credits and set credits to zero. +// may called when announcing credits +static inline int smc_wr_rx_get_credits(struct smc_link *link) +{ + return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; +} + +// called when post_recv a rqe +static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) +{ + if (link->credits_enable && credits) + atomic_add(credits, &link->local_rq_credits); +} + +// to check whether local rq credits is higher than watermark. 
+static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -95,6 +145,8 @@ static inline int smc_wr_rx_post(struct smc_link *link) index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); + if (!rc) + smc_wr_rx_put_credits(link, 1); return rc; } -- Gitee From 95a5084708262ddb44fbfae55c263c2391f94291 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 12 Jan 2022 01:04:22 +0800 Subject: [PATCH 23/76] anolis: net/smc: Introduce link-related proc file ANBZ: #1742 ANBZ: #346 This patch introduces link-related proc files to report statistics information of SMC-R links. Signed-off-by: Guangguan Wang Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_proc.c | 58 +++++++++++++++++++++++++++++++++++++++++++--- net/smc/smc_proc.h | 10 ++++---- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c index 19d8cc82a7ac..106887b7b9e1 100644 --- a/net/smc/smc_proc.c +++ b/net/smc/smc_proc.c @@ -154,9 +154,11 @@ static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 'C' : 'S', lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); + lnk->peer_qpn, smc->conn.tx_cnt, smc->conn.tx_bytes, + smc->conn.tx_corked_cnt, smc->conn.tx_corked_bytes); } else { - seq_puts(seq, "- - - - - - - -\n"); + seq_puts(seq, "- - - - - - -" + " - - -\n"); } } @@ -170,7 +172,7 @@ static int smc_conn_show(struct seq_file *seq, void *v) seq_printf(seq, sp->protocol == SMCPROTO_SMC ? 
CONN4_HDR : CONN6_HDR, "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_cnt", "rx_cnt"); + "l_qp", "r_qp", "tx_P", "tx_B", "cork_P", "cork_B"); goto out; } @@ -234,6 +236,51 @@ static struct smc_proc_entry smc_proc[] = { #endif }; +extern struct smc_lgr_list smc_lgr_list; +static int proc_show_links(struct seq_file *seq, void *v) +{ + struct smc_link_group *lgr, *lg; + struct smc_link *lnk; + int i = 0, j = 0; + + seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s\n", + "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", + "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags"); + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + if (!smc_link_usable(lnk)) + continue; + for (j = 0; j < SMC_LGR_ID_SIZE; j++) + seq_printf(seq, "%02X", lgr->id[j]); + seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu\n", + lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, + lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, + lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, + lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, + atomic_read(&lnk->local_rq_credits), + atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, + lnk->peer_cr_watermark_low, lnk->flags); + } + } + spin_unlock_bh(&smc_lgr_list.lock); + return 0; +} + +static int proc_open_links(struct inode *inode, struct file *file) +{ + single_open(file, proc_show_links, NULL); + return 0; +} + +static struct proc_ops link_file_ops = { +.proc_open = proc_open_links, +.proc_read = seq_read, +.proc_release = single_release, +}; + static int __net_init smc_proc_dir_init(struct net *net) { int i, rc = -ENOMEM; @@ -250,6 +297,9 @@ static int __net_init smc_proc_dir_init(struct net *net) goto err_entry; } + if (!proc_create("links", 0444, net->proc_net_smc, &link_file_ops)) + goto err_entry; + return 0; err_entry: @@ -265,6 +315,8 @@ static void __net_exit smc_proc_dir_exit(struct net *net) { int i; + remove_proc_entry("links", net->proc_net_smc); + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) remove_proc_entry(smc_proc[i].name, net->proc_net_smc); diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h index ec59ca03e163..faa5eaaee511 100644 --- a/net/smc/smc_proc.h +++ b/net/smc/smc_proc.h @@ -9,12 +9,14 @@ #include #include "smc.h" -#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ + "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ + "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") #define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") #define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") +#define CONN_SK_FM (" %c %-8X %pK 
%pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8llu %-8llu %-8llu %-8llu\n") struct smc_proc_private { struct seq_net_private p; -- Gitee From d53cadadc45a85b84fb8802d7a55f6e6061ce5e7 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 13 Jan 2022 17:06:19 +0800 Subject: [PATCH 24/76] anolis: net/smc: Introduce smc_ib_cq to bind link and cq ANBZ: #1742 ANBZ: #264 This patch introduces struct smc_ib_cq as a medium between smc_link and ib_cq. Every smc_link can access ib_cq from their own, and unbinds smc_link from smc_ib_device. This allows flexible mapping, prepares for multiple CQs support. Signed-off-by: Tony Lu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.h | 2 ++ net/smc/smc_ib.c | 86 ++++++++++++++++++++++++++++++++-------------- net/smc/smc_ib.h | 13 ++++--- net/smc/smc_wr.c | 32 ++++++++--------- 4 files changed, 88 insertions(+), 45 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 7f53309ad796..9130a6f87264 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -94,6 +94,8 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ + struct smc_ib_cq *smcibcq_recv; /* cq for recv */ + struct smc_ib_cq *smcibcq_send; /* cq for send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 5a183b754851..251cc60b7c8c 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,12 +131,12 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, IB_CQ_SOLICITED_MASK); if (rc) goto out; - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) goto out; @@ -656,6 +656,8 @@ void 
smc_ib_destroy_queue_pair(struct smc_link *lnk) if (lnk->roce_qp) ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; + lnk->smcibcq_send = NULL; + lnk->smcibcq_recv = NULL; } /* create a queue pair within the protection domain for a link */ @@ -665,8 +667,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->roce_cq_send, - .recv_cq = lnk->smcibdev->roce_cq_recv, + .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, + .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -693,10 +695,13 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); - if (IS_ERR(lnk->roce_qp)) + if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; - else + } else { + lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; + lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; smc_wr_remember_qp_attr(lnk); + } return rc; } @@ -843,10 +848,21 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } +static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) +{ + ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); + kfree(smcibdev->ib_cq_send); + smcibdev->ib_cq_send = NULL; + + ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); + kfree(smcibdev->ib_cq_recv); + smcibdev->ib_cq_recv = NULL; +} + long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { - struct ib_cq_init_attr cqattr = { - .cqe = SMC_MAX_CQE, .comp_vector = 0 }; + struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; + struct smc_ib_cq *smcibcq_send, *smcibcq_recv; int cqe_size_order, smc_order; long rc; @@ -859,28 +875,49 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - 
smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibdev, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); - if (IS_ERR(smcibdev->roce_cq_send)) { - smcibdev->roce_cq_send = NULL; + smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); + if (!smcibcq_send) { + rc = -ENOMEM; + goto out; + } + smcibcq_send->smcibdev = smcibdev; + smcibcq_send->is_send = 1; + cqattr.comp_vector = 0; + smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibcq_send, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); + if (IS_ERR(smcibdev->ib_cq_send)) { + smcibdev->ib_cq_send = NULL; goto out; } - smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibdev, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); - if (IS_ERR(smcibdev->roce_cq_recv)) { - smcibdev->roce_cq_recv = NULL; - goto err; + smcibdev->ib_cq_send = smcibcq_send; + + smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); + if (!smcibcq_recv) { + rc = -ENOMEM; + goto err_send; + } + smcibcq_recv->smcibdev = smcibdev; + cqattr.comp_vector = 1; + smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibcq_recv, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); + if (IS_ERR(smcibdev->ib_cq_recv)) { + smcibdev->ib_cq_recv = NULL; + goto err_recv; } + smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; goto out; -err: - ib_destroy_cq(smcibdev->roce_cq_send); +err_recv: + kfree(smcibcq_recv); + ib_destroy_cq(smcibcq_send->ib_cq); +err_send: + kfree(smcibcq_send); out: mutex_unlock(&smcibdev->mutex); return rc; @@ -892,8 +929,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) goto out; smcibdev->initialized = 0; - ib_destroy_cq(smcibdev->roce_cq_recv); - ib_destroy_cq(smcibdev->roce_cq_send); + smc_ib_cleanup_cq(smcibdev); smc_wr_remove_dev(smcibdev); out: 
mutex_unlock(&smcibdev->mutex); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 034295676e88..15b213f19c6e 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -32,15 +32,20 @@ struct smc_ib_devices { /* list of smc ib devices definition */ extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ +struct smc_ib_cq { /* ib_cq wrapper for smc */ + struct smc_ib_device *smcibdev; /* parent ib device */ + struct ib_cq *ib_cq; /* real ib_cq for link */ + struct tasklet_struct tasklet; /* tasklet for wr */ + bool is_send; /* send for recv cq */ +}; + struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - struct ib_cq *roce_cq_send; /* send completion queue */ - struct ib_cq *roce_cq_recv; /* recv completion queue */ - struct tasklet_struct send_tasklet; /* called by send cq handler */ - struct tasklet_struct recv_tasklet; /* called by recv cq handler */ + struct smc_ib_cq *ib_cq_send; /* send completion queue */ + struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 55c1deb6bc7f..b30c23469704 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -136,7 +136,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); + struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int i = 0, rc; int polled = 0; @@ -145,9 +145,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(dev->roce_cq_send, 
SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(dev->roce_cq_send, + ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); } @@ -162,9 +162,9 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - tasklet_schedule(&dev->send_tasklet); + tasklet_schedule(&smcibcq->tasklet); } /*---------------------------- request submission ---------------------------*/ @@ -327,7 +327,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibdev->roce_cq_send, + ib_req_notify_cq(link->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { @@ -371,7 +371,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, + ib_req_notify_cq(link->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; @@ -486,7 +486,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); + struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int polled = 0; int rc; @@ -495,9 +495,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); + rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); if (polled == 1) { - ib_req_notify_cq(dev->roce_cq_recv, + 
ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); } @@ -511,9 +511,9 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - tasklet_schedule(&dev->recv_tasklet); + tasklet_schedule(&smcibcq->tasklet); } int smc_wr_rx_post_init(struct smc_link *link) @@ -845,14 +845,14 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->recv_tasklet); - tasklet_kill(&smcibdev->send_tasklet); + tasklet_kill(&smcibdev->ib_cq_recv->tasklet); + tasklet_kill(&smcibdev->ib_cq_send->tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 66fa398f4f5b8387307c39a550222a229b850f11 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 13 Jan 2022 17:34:53 +0800 Subject: [PATCH 25/76] anolis: net/smc: Multiple CQs per IB devices ANBZ: #1742 ANBZ: #264 This allows multiple CQs for one IB device, compared to one CQ now. During IB device setup, it would initialize ibdev->num_comp_vectors amount of send/recv CQs, and the corresponding tasklets, like queues for net devices. Every smc_link has their own send and recv CQs, which always assigning from the least used CQs of current IB device. 
Signed-off-by: Tony Lu Reviewed-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 139 +++++++++++++++++++++++++++++++---------------- net/smc/smc_ib.h | 6 +- net/smc/smc_wr.c | 18 ++++-- 3 files changed, 111 insertions(+), 52 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 251cc60b7c8c..e1b09307da06 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -630,6 +630,36 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, + bool is_send) +{ + struct smc_ib_cq *smcibcq, *cq; + int min, i; + + if (is_send) + smcibcq = smcibdev->smcibcq_send; + else + smcibcq = smcibdev->smcibcq_recv; + + cq = smcibcq; + min = cq->load; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + if (smcibcq[i].load < min) { + cq = &smcibcq[i]; + min = cq->load; + } + } + + cq->load++; + return cq; +} + +static void smc_ib_put_cq(struct smc_ib_cq *smcibcq) +{ + smcibcq->load--; +} + static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -653,8 +683,11 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - if (lnk->roce_qp) + if (lnk->roce_qp) { ib_destroy_qp(lnk->roce_qp); + smc_ib_put_cq(lnk->smcibcq_send); + smc_ib_put_cq(lnk->smcibcq_recv); + } lnk->roce_qp = NULL; lnk->smcibcq_send = NULL; lnk->smcibcq_recv = NULL; @@ -663,12 +696,16 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { + struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, + true); + struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, + false); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->ib_cq_send->ib_cq, - .recv_cq = lnk->smcibdev->ib_cq_recv->ib_cq, + .send_cq = smcibcq_send->ib_cq, + .recv_cq = smcibcq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -698,8 +735,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; } else { - lnk->smcibcq_send = lnk->smcibdev->ib_cq_send; - lnk->smcibcq_recv = lnk->smcibdev->ib_cq_recv; + lnk->smcibcq_send = smcibcq_send; + lnk->smcibcq_recv = smcibcq_recv; smc_wr_remember_qp_attr(lnk); } return rc; @@ -850,20 +887,26 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { - ib_destroy_cq(smcibdev->ib_cq_send->ib_cq); - kfree(smcibdev->ib_cq_send); - smcibdev->ib_cq_send = NULL; + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + if (smcibdev->smcibcq_send[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); + + if (smcibdev->smcibcq_recv[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); + } - ib_destroy_cq(smcibdev->ib_cq_recv->ib_cq); - kfree(smcibdev->ib_cq_recv); - smcibdev->ib_cq_recv = NULL; + kfree(smcibdev->smcibcq_send); + kfree(smcibdev->smcibcq_recv); } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; - struct smc_ib_cq *smcibcq_send, *smcibcq_recv; int cqe_size_order, smc_order; + struct smc_ib_cq *smcibcq; + int i, num_cq_peer; long rc; mutex_lock(&smcibdev->mutex); @@ -875,49 +918,53 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - smcibcq_send = kzalloc(sizeof(*smcibcq_send), GFP_KERNEL); - if (!smcibcq_send) { + num_cq_peer = min_t(int, 
smcibdev->ibdev->num_comp_vectors, + num_online_cpus()); + smcibdev->num_cq_peer = num_cq_peer; + smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), + GFP_KERNEL); + if (!smcibdev->smcibcq_send) { rc = -ENOMEM; - goto out; - } - smcibcq_send->smcibdev = smcibdev; - smcibcq_send->is_send = 1; - cqattr.comp_vector = 0; - smcibcq_send->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq_send, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_send); - if (IS_ERR(smcibdev->ib_cq_send)) { - smcibdev->ib_cq_send = NULL; - goto out; + goto err; } - smcibdev->ib_cq_send = smcibcq_send; - - smcibcq_recv = kzalloc(sizeof(*smcibcq_recv), GFP_KERNEL); - if (!smcibcq_recv) { + smcibdev->smcibcq_recv = kcalloc(num_cq_peer, sizeof(*smcibcq), + GFP_KERNEL); + if (!smcibdev->smcibcq_recv) { rc = -ENOMEM; - goto err_send; + goto err; } - smcibcq_recv->smcibdev = smcibdev; - cqattr.comp_vector = 1; - smcibcq_recv->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibcq_recv, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->ib_cq_recv); - if (IS_ERR(smcibdev->ib_cq_recv)) { - smcibdev->ib_cq_recv = NULL; - goto err_recv; + + /* initialize CQs */ + for (i = 0; i < num_cq_peer; i++) { + /* initialize send CQ */ + smcibcq = &smcibdev->smcibcq_send[i]; + smcibcq->smcibdev = smcibdev; + smcibcq->is_send = 1; + cqattr.comp_vector = i; + smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibcq, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); + if (IS_ERR(smcibcq->ib_cq)) + goto err; + + /* initialize recv CQ */ + smcibcq = &smcibdev->smcibcq_recv[i]; + smcibcq->smcibdev = smcibdev; + cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ + smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibcq, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); + if (IS_ERR(smcibcq->ib_cq)) + goto err; } - smcibdev->ib_cq_recv = smcibcq_recv; smc_wr_add_dev(smcibdev); 
smcibdev->initialized = 1; goto out; -err_recv: - kfree(smcibcq_recv); - ib_destroy_cq(smcibcq_send->ib_cq); -err_send: - kfree(smcibcq_send); +err: + smc_ib_cleanup_cq(smcibdev); out: mutex_unlock(&smcibdev->mutex); return rc; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 15b213f19c6e..456d59670031 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -37,6 +37,7 @@ struct smc_ib_cq { /* ib_cq wrapper for smc */ struct ib_cq *ib_cq; /* real ib_cq for link */ struct tasklet_struct tasklet; /* tasklet for wr */ bool is_send; /* send for recv cq */ + int load; /* load of current cq */ }; struct smc_ib_device { /* ib-device infos for smc */ @@ -44,8 +45,9 @@ struct smc_ib_device { /* ib-device infos for smc */ struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - struct smc_ib_cq *ib_cq_send; /* send completion queue */ - struct smc_ib_cq *ib_cq_recv; /* recv completion queue */ + int num_cq_peer; /* num of snd/rcv cq peer */ + struct smc_ib_cq *smcibcq_send; /* send cqs */ + struct smc_ib_cq *smcibcq_recv; /* recv cqs */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index b30c23469704..937339fd1fdb 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -845,14 +845,24 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->ib_cq_recv->tasklet); - tasklet_kill(&smcibdev->ib_cq_send->tasklet); + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); + tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); + } } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->ib_cq_recv->tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->ib_cq_send->tasklet, smc_wr_tx_tasklet_fn); + int i; + + 
for (i = 0; i < smcibdev->num_cq_peer; i++) { + tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, + smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, + smc_wr_rx_tasklet_fn); + } } int smc_wr_create_link(struct smc_link *lnk) -- Gitee From 8d58c627e4cd34c4840fdd86c1859a5699ccee99 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Mon, 7 Mar 2022 13:16:28 +0800 Subject: [PATCH 26/76] anolis: net/smc: Introduce a sysctl to disable {a}symmetric link group ANBZ: #1742 When smc uses erdma as underlay implementation, smc link (rdma connection) is created according to five-tupe of tcp connection. However, the second smc link can't be created correctly if there is no another tcp connection. So we decide to disable {a}symmetric link group defaultly in erdma environment by a sysctl as a workaround. Signed-off-by: Wen Gu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 36 +++++++++++++++++++++--------------- net/smc/smc_core.c | 3 +++ net/smc/smc_llc.c | 9 +++++++++ net/smc/smc_sysctl.c | 10 ++++++++++ 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 5bcce341b011..f8a6c4ebb985 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -31,5 +31,6 @@ struct netns_smc { int sysctl_rmem; int sysctl_tcp2smc; int sysctl_allow_different_subnet; + int sysctl_disable_multiple_link; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 732bac33cdd4..db51d71710d8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -555,6 +555,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; + struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -601,20 +602,22 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, 
SMC_LGR_SINGLE); - /* optional 2nd link, receive ADD LINK request from server */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, - SMC_LLC_ADD_LINK); - if (!qentry) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - if (rc == -EAGAIN) - rc = 0; /* no DECLINE received, go with one link */ - return rc; + if (!net->smc.sysctl_disable_multiple_link) { + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; + } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); } - smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - smc_llc_cli_add_link(link, qentry); return 0; } @@ -1801,6 +1804,7 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; + struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -1841,8 +1845,10 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link, NULL); + if (!net->smc.sysctl_disable_multiple_link) { + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); + } return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 10a7b330e626..8c6ad5c5fbec 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1676,6 +1676,9 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) lgr->type == SMC_LGR_ASYMMETRIC_PEER || !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; + 
if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + continue; /* trigger local add link processing */ link = smc_llc_usable_link(lgr); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 34b1028d8d70..791b45329349 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1090,6 +1090,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) rc = -ENOMEM; goto out_reject; } + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out_reject; ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { @@ -1215,6 +1218,9 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, if (lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out; ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) @@ -1460,6 +1466,9 @@ int smc_llc_srv_add_link(struct smc_link *link, rc = -ENOMEM; goto out; } + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out; /* ignore client add link recommendation, start new flow */ ini->vlan_id = lgr->vlan_id; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index efadc6e5bacb..fd5c3935313f 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -88,6 +88,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "disable_multiple_link", + .data = &init_net.smc.sysctl_disable_multiple_link, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -118,6 +127,7 @@ int __net_init smc_sysctl_net_init(struct net *net) WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 1; + net->smc.sysctl_disable_multiple_link = 1; return 0; err_reg: -- Gitee From 
979dcbe4a41a97fa6cbcc16517f16763201e9a7f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 15 Mar 2022 15:52:29 +0800 Subject: [PATCH 27/76] anolis: net/smc: Introduce rtoken validity check before sending ANBZ: #1742 ANBZ: #264 The local peer might be still sending data when receiving remote peer requests for deleting rtoken. So the local peer might use an already deleted rkey for rdma write operation. In eRDMA scenario, this may cause a hung because eRDMA driver won't generate CQEs for sending with wrong rkey and cdc_pend_tx_wr won't reach zero anymore. So this patch tries to fix this by checking rtoken validity before rdma write operation. This won't cause data loss because at this moment, the remote peer must be SMC_CLOSE state and no longer want to receive any data. Signed-off-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_tx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index e1831cfc0ae5..5e84bd07dfed 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -353,6 +353,12 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; + /* rtoken might be deleted if peer freed connection */ + if (!rdma_wr->rkey || + (rdma_wr->remote_addr == (conn->tx_off + peer_rmbe_offset))) { + pr_warn_ratelimited("smc: unexpected sends during connection termination flow\n"); + return -EINVAL; + } rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); -- Gitee From 9366810ae17bdcc9169aa0f7f6eefdc309e69013 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 22 Mar 2022 17:27:04 +0800 Subject: [PATCH 28/76] anolis: net/smc: don't req_notify until all CQEs drained ANBZ: #1742 ANBZ: #264 When we are handling softirq workload, enable hardirq may again interrupt the current routine of softirq, and 
then try to raise softirq again. This only wastes CPU cycles and won't have any real gain. Since IB_CQ_REPORT_MISSED_EVENTS already make sure if ib_req_notify_cq() returns 0, it is safe to wait for the next event, with no need to poll the CQ again in this case. This patch disables hardirq during the processing of softirq, and re-arm the CQ after softirq is done. Somehow like NAPI. Co-developed-by: Guangguan Wang Signed-off-by: Guangguan Wang Signed-off-by: Dust Li Signed-off-by: Wen Gu Acked-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 49 +++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 937339fd1fdb..5a5a2f4ea9d0 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -138,25 +138,28 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i = 0, rc; - int polled = 0; + int i, rc; again: - polled++; do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - if (polled == 1) { - ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS); - } - if (!rc) - break; for (i = 0; i < rc; i++) smc_wr_tx_process_cqe(&wc[i]); + if (rc < SMC_WR_MAX_POLL_CQE) + /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been + * drained, no need to poll again. + */ + break; } while (rc > 0); - if (polled == 1) + + /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, + * then it is safe to wait for the next event; else we must poll the + * CQ again to make sure we won't miss any event. 
+ */ + if (ib_req_notify_cq(smcibcq->ib_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; } @@ -488,24 +491,28 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int polled = 0; int rc; again: - polled++; do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - if (polled == 1) { - ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_SOLICITED_MASK - | IB_CQ_REPORT_MISSED_EVENTS); - } - if (!rc) + if (rc > 0) + smc_wr_rx_process_cqes(&wc[0], rc); + if (rc < SMC_WR_MAX_POLL_CQE) + /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been + * drained, no need to poll again. + */ break; - smc_wr_rx_process_cqes(&wc[0], rc); } while (rc > 0); - if (polled == 1) + + /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, + * then it is safe to wait for the next event; else we must poll the + * CQ again to make sure we won't miss any event. + */ + if (ib_req_notify_cq(smcibcq->ib_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; } -- Gitee From e5113d36b6b974ad5ecccc630195dc46149a1f26 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 26 Apr 2022 21:25:10 +0800 Subject: [PATCH 29/76] anolis: net/smc: Fix NULL sk pointer when access clcsock ANBZ: #1742 This patch fixes NULL sk pointer in clcsock. 
Signed-off-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 5 ++++- net/smc/smc_clc.c | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index db51d71710d8..b2d50ae904b8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1858,8 +1858,11 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + mutex_lock(&new_smc->clcsock_release_lock); + if (new_smc->clcsock && new_smc->clcsock->sk && + tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); + mutex_unlock(&new_smc->clcsock_release_lock); if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ba20049ef6ce..9a75119b3437 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -795,7 +795,13 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = send_len; + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock || !smc->clcsock->sk) { + mutex_unlock(&smc->clcsock_release_lock); + return -EPROTO; + } len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); + mutex_unlock(&smc->clcsock_release_lock); if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 0 : len; -- Gitee From a0375d991a7dc17f54075cdf775608d4e5d0599d Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 27 Apr 2022 15:24:43 +0800 Subject: [PATCH 30/76] anolis: net/smc: Avoid clcsock access panic ANBZ: #1742 This patch is a set of the workaround for clcsock access panic. There are two kinds of invalid access of clcsock. 
1) Access smc->clcsock when smc->clcsock is reset to NULL; 2) Access smc->clcsock->sk when sock_release(clcsock); In upstream implementation, only 1) happens, and it is fixed by c0bf3d8a943b ("net/smc: Transitional solution for clcsock race issue"). In anolis implementation, 1) and 2) are both reproduced. They are mainly triggered by c5e5a9f9c5d8 ("net/smc: Keep first contact clcsock"). In anolis smc implementation, The first contact's clcsock is saved in link struct and may be released during smc link clear. After that, if smc->clcsock is accessed, a NULL pointer panic will happen. This patch provides a workaround for these. To eradicate such issues, We may need to avoid using first contact's clcsock as erdma link. Fixes: c0bf3d8a943b ("net/smc: Transitional solution for clcsock race issue"). Signed-off-by: Wen Gu Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b2d50ae904b8..efcc8446d98a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1630,6 +1630,11 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, break; } + if (!smc->clcsock || + (smc->clcsock && !smc->clcsock->sk)) { + rc = -EBADF; + goto out; + } smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; if (smc->connect_nonblock) { @@ -1693,10 +1698,12 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); + mutex_lock(&lsmc->clcsock_release_lock); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); + mutex_unlock(&lsmc->clcsock_release_lock); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; -- Gitee From 88c04c0a162dca9e4ffc71abdd2ef5f53211f09b Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 6 May 2022 
10:19:04 +0800 Subject: [PATCH 31/76] anolis: net/smc: do not send msg in receiving process when tx is not blocked. ANBZ: #1742 As user send thread(normal send path) and tx completion tasklet(corked send path) will send msgs, there is no need to send msg in recv completion tasklet when RMB's ci updated and smc_tx_prepared_sends, which may slower the recv performance as recv completion tasklet is shared by multiple connections, but write_blocked condition. In netty benchamrk, show 28% improvement in throughput: Before: throughput cpu sys usr thread-480 connect-48 len-8: 1653807.614 124.755 69.0489 55.7061 After: throughput cpu sys usr thread-480 connect-48 len-8: 2113879.617 132.117 67.9467 64.1707 Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_cdc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 410134dccbf9..482e60753216 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -363,7 +363,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, } /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - if ((diff_cons && smc_tx_prepared_sends(conn)) || + if ((diff_cons && smc_tx_prepared_sends(conn) && + conn->local_tx_ctrl.prod_flags.write_blocked) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || conn->local_rx_ctrl.prod_flags.urg_data_pending) { if (!sock_owned_by_user(&smc->sk)) -- Gitee From 8a1d3b9c9a2fcdf18378c8237f765127a72a0544 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 6 May 2022 10:45:52 +0800 Subject: [PATCH 32/76] anolis: net/smc: compress frequency of credits announcement by cdc msg ANBZ: #1742 When in heavy traffic, credits token by cdc msg maybe few and wakeup frequently when credits update in recv side, which may use more cpu. 
Set announcement wartermark, which is 10% of local rq credits, can compress the announcement frequecy, and the credits taken by cdc msg is more than 10% of local rq credits, reduce the wakeup frequency in the recv side. In netty benchamrk, show 28% improvement in throughput: Before: throughput cpu sys usr thread-480 connect-48 len-8: 1653807.614 124.755 69.0489 55.7061 After: throughput cpu sys usr thread-480 connect-48 len-8: 2113879.617 132.117 67.9467 64.1707 Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_cdc.c | 3 ++- net/smc/smc_core.h | 1 + net/smc/smc_wr.c | 5 +++++ net/smc/smc_wr.h | 11 +++++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 482e60753216..25b836df9f50 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -121,7 +121,8 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc(cdc_msg, conn, &cfed); - saved_credits = (u8)smc_wr_rx_get_credits(link); + if (smc_wr_rx_credits_need_announce_frequent(link)) + saved_credits = (u8)smc_wr_rx_get_credits(link); cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 9130a6f87264..a77991672701 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -138,6 +138,7 @@ struct smc_link { u8 credits_enable; /* credits enable flag, set when negotiation */ u8 local_cr_watermark_high; /* local rq credits watermark */ u8 peer_cr_watermark_low; /* peer rq credits watermark */ + u8 credits_update_limit; /* credits update limit for cdc msg */ struct work_struct credits_announce_work; /* work for credits announcement */ unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 5a5a2f4ea9d0..2971b3a73bf2 100644 --- a/net/smc/smc_wr.c +++ 
b/net/smc/smc_wr.c @@ -923,6 +923,11 @@ int smc_wr_create_link(struct smc_link *lnk) lnk->flags = 0; lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); lnk->peer_cr_watermark_low = 0; + + /* if credits accumlated less than 10% of wr_rx_cnt(at least 5), + * will not be announced by cdc msg. + */ + lnk->credits_update_limit = max(lnk->wr_rx_cnt / 10, 5U); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 8cf276215c91..5b671065afdc 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -133,6 +133,17 @@ static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; } +static inline int smc_wr_rx_credits_need_announce_frequent(struct smc_link *link) +{ + /* announce when local rq credits accumulated more than credits_update_limit, or + * peer rq credits is empty. As peer credits empty and local credits is less than + * credits_update_limit, may results in credits deadlock. + */ + return link->credits_enable && + (atomic_read(&link->local_rq_credits) >= link->credits_update_limit || + !atomic_read(&link->peer_rq_credits)); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { -- Gitee From 9973e0bcd33d281c194a90991cb0523b2293df29 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 21 Mar 2022 21:10:41 +0800 Subject: [PATCH 33/76] anolis: net/smc: Release lock before waiting for CLC accept message ANBZ: #1742 Applications use to call setsockopt() after connect(), which requires under the sock lock. Holding the sock lock during the CLC handshake may cause applications have to wait until CLC accept message got ready. Signed-off-by: D. 
Wythe Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index efcc8446d98a..884762786f71 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1128,9 +1128,13 @@ static int smc_connect_clc(struct smc_sock *smc, rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; + + release_sock(&smc->sk); /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, - SMC_CLC_ACCEPT, CLC_WAIT_TIME); + rc = smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + SMC_CLC_ACCEPT, CLC_WAIT_TIME); + lock_sock(&smc->sk); + return rc; } void smc_fill_gid_list(struct smc_link_group *lgr, -- Gitee From 04f5cf906b67e9e4de6d8863d12c5499cb943ffe Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 5 May 2022 12:57:42 +0800 Subject: [PATCH 34/76] anolis: net/smc: Disable confirm rkey message exchange when only one link exists ANBZ: #1742 If there is only one link between the two sides of communication, it is not necessary to perform confirm RKey message exchange. Signed-off-by: D. 
Wythe Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 884762786f71..43f489f93021 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -522,7 +522,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { struct smc_link_group *lgr = link->lgr; - int i, rc = 0; + int i, lnk = 0, rc = 0; rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) @@ -537,14 +537,20 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; + /* available link count inc */ + lnk++; } - /* exchange confirm_rkey msg with peer */ - rc = smc_llc_do_confirm_rkey(link, rmb_desc); - if (rc) { - rc = -EFAULT; - goto out; + /* do not exchange confirm_rkey msg since there are only one link */ + if (lnk > 1) { + /* exchange confirm_rkey msg with peer */ + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; + } } + rmb_desc->is_conf_rkey = true; out: mutex_unlock(&lgr->llc_conf_mutex); -- Gitee From 93b439da14999db1714984cfb9f21531feea5ddb Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 31 Mar 2022 13:17:28 +0800 Subject: [PATCH 35/76] anolis: net/smc: Avoid syscall block by async smc_conn_free ANBZ: #1742 smc_conn_free() will wait for rkey delete message, which will block the application syscall. Signed-off-by: D. 
Wythe Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 35 ++++++++++++++++++++++++++++------- net/smc/smc.h | 1 + 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 43f489f93021..e2ca4c0da7c5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -299,13 +299,9 @@ static int __smc_release(struct smc_sock *smc) sk->sk_prot->unhash(sk); if (sk->sk_state == SMC_CLOSED) { - if (smc->clcsock) { - release_sock(sk); - smc_clcsock_release(smc); - lock_sock(sk); - } - if (!smc->use_fallback) - smc_conn_free(&smc->conn); + sock_hold(sk); + if (!queue_work(smc_hs_wq, &smc->free_work)) + sock_put(sk); } return rc; @@ -367,6 +363,30 @@ static void smc_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } +static void smc_free_work(struct work_struct *work) +{ + struct sock *sk; + struct smc_sock *smc = container_of(work, struct smc_sock, + free_work); + + sk = &smc->sk; + + lock_sock(sk); + if (sk->sk_state == SMC_CLOSED) { + if (smc->clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } + + if (!smc->use_fallback) + smc_conn_free(&smc->conn); + } + release_sock(sk); + + sock_put(sk); /* before queue */ +} + static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, int protocol) { @@ -387,6 +407,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_WORK(&smc->free_work, smc_free_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); diff --git a/net/smc/smc.h b/net/smc/smc.h index 9d73fc5fdbc2..86947bec41d4 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -256,6 +256,7 @@ struct smc_sock { /* smc sock container */ struct work_struct connect_work; /* handle non-blocking 
connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ + struct work_struct free_work; /* free smc conn */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool limit_smc_hs; /* put constraint on handshake */ -- Gitee From 26a235180da4519c68a91be0044b34adafe5d78c Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 25 May 2022 10:47:50 +0800 Subject: [PATCH 36/76] anolis: net/smc: move wc loop out of smc_wr_rx_process_cqes ANBZ: #1742 move wc loop out of smc_wr_rx_process_cqes to align the behaviour of smc_wr_tx_process_cqe. Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 52 ++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 2971b3a73bf2..4df29a0fafd2 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -454,36 +454,32 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) } } -static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) +static inline void smc_wr_rx_process_cqe(struct ib_wc *wc) { - struct smc_link *link; - int i; + struct smc_link *link = wc->qp->qp_context; - for (i = 0; i < num; i++) { - link = wc[i].qp->qp_context; - if (wc[i].status == IB_WC_SUCCESS) { - link->wr_rx_tstamp = jiffies; - smc_wr_rx_demultiplex(&wc[i]); + if (wc->status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; + smc_wr_rx_demultiplex(wc); + smc_wr_rx_post(link); /* refill WR RX */ + } else { + /* handle status errors */ + switch (wc->status) { + case IB_WC_RETRY_EXC_ERR: + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + smcr_link_down_cond_sched(link); + break; + default: smc_wr_rx_post(link); /* refill WR RX */ - } else { - /* handle status errors */ - switch (wc[i].status) { - case 
IB_WC_RETRY_EXC_ERR: - case IB_WC_RNR_RETRY_EXC_ERR: - case IB_WC_WR_FLUSH_ERR: - smcr_link_down_cond_sched(link); - break; - default: - smc_wr_rx_post(link); /* refill WR RX */ - break; - } + break; } + } - if (smc_wr_rx_credits_need_announce(link) && - !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { - set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); - schedule_work(&link->credits_announce_work); - } + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); } } @@ -491,14 +487,14 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int rc; + int i, rc; again: do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - if (rc > 0) - smc_wr_rx_process_cqes(&wc[0], rc); + for (i = 0; i < rc; i++) + smc_wr_rx_process_cqe(&wc[i]); if (rc < SMC_WR_MAX_POLL_CQE) /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been * drained, no need to poll again. -- Gitee From 56162ee760a70ef17f139ba92fbbf3cd7d7cbda9 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 25 May 2022 11:37:48 +0800 Subject: [PATCH 37/76] anolis: net/smc: combine send cq and recv cq into one cq ANBZ: #1742 SMC-R uses two CQs per link, one for SQ called SCQ and the other for RQ called RCQ. RDMA supports SCQ and RCQ are the same CQ. In RDMA, more CQs means more interrupts as less cqe polled out echo poll_cq. This patch combines send cq and recv cq into one cq. Because of halving the number of CQs, fewer interrupts are generated and hi usage is lower. 
Nginx benchmark shows 5.8% improvement in throughput: Server test command: smc_run nginx Client test command: smc_run /opt/wrk/wrk http://ip:port -t 32 -c 992 -d 30 --latency Before: Requests/sec: 1927316.76 Transfer/sec: 295.92MB After: Requests/sec: 2039360.72 Transfer/sec: 313.13MB Redis benchmark shows 8% improvement in cpu usage: Server test command: smc_run ./redis-server --save "" --appendonly no --protected-mode no sar 1 10 (Cpu usage collect command) Client test command: smc_run ./redis-benchmark -h -q -t set -P 1 --threads 7\ -n 25000000 -c 100 -d 10 Before: CPU %user %nice %system %iowait %steal %idle all 0.90 0.00 5.44 0.00 0.00 93.66 After: CPU %user %nice %system %iowait %steal %idle all 0.87 0.00 5.00 0.00 0.00 94.13 Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_core.h | 3 +- net/smc/smc_ib.c | 83 +++++++++++--------------------------- net/smc/smc_ib.h | 6 +-- net/smc/smc_wr.c | 99 +++++++++++++++++----------------------------- net/smc/smc_wr.h | 3 +- 5 files changed, 64 insertions(+), 130 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index a77991672701..1bcd099a3a97 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -94,8 +94,7 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ - struct smc_ib_cq *smcibcq_recv; /* cq for recv */ - struct smc_ib_cq *smcibcq_send; /* cq for send */ + struct smc_ib_cq *smcibcq; /* cq for recv & send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index e1b09307da06..6d54861b2a3f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,12 +131,12 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, + rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, 
IB_CQ_SOLICITED_MASK); if (rc) goto out; - rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, + rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) goto out; @@ -630,21 +630,16 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, - bool is_send) +static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev) { struct smc_ib_cq *smcibcq, *cq; int min, i; - if (is_send) - smcibcq = smcibdev->smcibcq_send; - else - smcibcq = smcibdev->smcibcq_recv; - + smcibcq = smcibdev->smcibcq; cq = smcibcq; min = cq->load; - for (i = 0; i < smcibdev->num_cq_peer; i++) { + for (i = 0; i < smcibdev->num_cq; i++) { if (smcibcq[i].load < min) { cq = &smcibcq[i]; min = cq->load; @@ -685,27 +680,22 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) { if (lnk->roce_qp) { ib_destroy_qp(lnk->roce_qp); - smc_ib_put_cq(lnk->smcibcq_send); - smc_ib_put_cq(lnk->smcibcq_recv); + smc_ib_put_cq(lnk->smcibcq); } lnk->roce_qp = NULL; - lnk->smcibcq_send = NULL; - lnk->smcibcq_recv = NULL; + lnk->smcibcq = NULL; } /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, - true); - struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, - false); + struct smc_ib_cq *smcibcq = smc_ib_get_least_used_cq(lnk->smcibdev); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = smcibcq_send->ib_cq, - .recv_cq = smcibcq_recv->ib_cq, + .send_cq = smcibcq->ib_cq, + .recv_cq = smcibcq->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -735,8 +725,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; } else { - lnk->smcibcq_send = smcibcq_send; - lnk->smcibcq_recv = smcibcq_recv; + lnk->smcibcq = smcibcq; smc_wr_remember_qp_attr(lnk); } return rc; @@ -889,16 +878,12 @@ static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { int i; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - if (smcibdev->smcibcq_send[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); - - if (smcibdev->smcibcq_recv[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); + for (i = 0; i < smcibdev->num_cq; i++) { + if (smcibdev->smcibcq[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq[i].ib_cq); } - kfree(smcibdev->smcibcq_send); - kfree(smcibdev->smcibcq_recv); + kfree(smcibdev->smcibcq); } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) @@ -906,7 +891,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; int cqe_size_order, smc_order; struct smc_ib_cq *smcibcq; - int i, num_cq_peer; + int i, num_cq; long rc; mutex_lock(&smcibdev->mutex); @@ -918,42 +903,22 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - num_cq_peer = min_t(int, smcibdev->ibdev->num_comp_vectors, - num_online_cpus()); - smcibdev->num_cq_peer = num_cq_peer; - smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_send) { - rc = -ENOMEM; - goto err; - } - smcibdev->smcibcq_recv = kcalloc(num_cq_peer, 
sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_recv) { + num_cq = min_t(int, smcibdev->ibdev->num_comp_vectors, + num_online_cpus()); + smcibdev->num_cq = num_cq; + smcibdev->smcibcq = kcalloc(num_cq, sizeof(*smcibcq), GFP_KERNEL); + if (!smcibdev->smcibcq) { rc = -ENOMEM; goto err; } /* initialize CQs */ - for (i = 0; i < num_cq_peer; i++) { - /* initialize send CQ */ - smcibcq = &smcibdev->smcibcq_send[i]; + for (i = 0; i < num_cq; i++) { + smcibcq = &smcibdev->smcibcq[i]; smcibcq->smcibdev = smcibdev; - smcibcq->is_send = 1; cqattr.comp_vector = i; smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) - goto err; - - /* initialize recv CQ */ - smcibcq = &smcibdev->smcibcq_recv[i]; - smcibcq->smcibdev = smcibdev; - cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, + smc_wr_cq_handler, NULL, smcibcq, &cqattr); rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); if (IS_ERR(smcibcq->ib_cq)) diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 456d59670031..62f4e5619147 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -36,7 +36,6 @@ struct smc_ib_cq { /* ib_cq wrapper for smc */ struct smc_ib_device *smcibdev; /* parent ib device */ struct ib_cq *ib_cq; /* real ib_cq for link */ struct tasklet_struct tasklet; /* tasklet for wr */ - bool is_send; /* send for recv cq */ int load; /* load of current cq */ }; @@ -45,9 +44,8 @@ struct smc_ib_device { /* ib-device infos for smc */ struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. 
port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - int num_cq_peer; /* num of snd/rcv cq peer */ - struct smc_ib_cq *smcibcq_send; /* send cqs */ - struct smc_ib_cq *smcibcq_recv; /* recv cqs */ + int num_cq; /* num of snd/rcv cq */ + struct smc_ib_cq *smcibcq; /* send & recv cqs */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 4df29a0fafd2..cb8bd0e04cb4 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -134,42 +134,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) wake_up(&link->wr_tx_wait); } -static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) -{ - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); - struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i, rc; - -again: - do { - memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - for (i = 0; i < rc; i++) - smc_wr_tx_process_cqe(&wc[i]); - if (rc < SMC_WR_MAX_POLL_CQE) - /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been - * drained, no need to poll again. - */ - break; - } while (rc > 0); - - /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, - * then it is safe to wait for the next event; else we must poll the - * CQ again to make sure we won't miss any event. 
- */ - if (ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0) - goto again; -} - -void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) -{ - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - - tasklet_schedule(&smcibcq->tasklet); -} - /*---------------------------- request submission ---------------------------*/ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) @@ -330,7 +294,7 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibcq_send->ib_cq, + ib_req_notify_cq(link->smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { @@ -374,7 +338,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibcq_send->ib_cq, + ib_req_notify_cq(link->smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; @@ -483,7 +447,19 @@ static inline void smc_wr_rx_process_cqe(struct ib_wc *wc) } } -static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) +int smc_wr_rx_post_init(struct smc_link *link) +{ + u32 i; + int rc = 0; + + for (i = 0; i < link->wr_rx_cnt; i++) + rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); + return rc; +} + +static void smc_wr_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; @@ -493,8 +469,21 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - for (i = 0; i < rc; i++) - smc_wr_rx_process_cqe(&wc[i]); + for (i = 0; i < rc; i++) { + switch (wc[i].opcode) { + case IB_WC_REG_MR: + case IB_WC_SEND: + 
smc_wr_tx_process_cqe(&wc[i]); + break; + case IB_WC_RECV: + smc_wr_rx_process_cqe(&wc[i]); + break; + default: + pr_warn("smc: unexpected wc opcode %d, status %d, wr_id %llu.\n", + wc[i].opcode, wc[i].status, wc[i].wr_id); + break; + } + } if (rc < SMC_WR_MAX_POLL_CQE) /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been * drained, no need to poll again. @@ -512,25 +501,13 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) goto again; } -void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context) { struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; tasklet_schedule(&smcibcq->tasklet); } -int smc_wr_rx_post_init(struct smc_link *link) -{ - u32 i; - int rc = 0; - - for (i = 0; i < link->wr_rx_cnt; i++) - rc = smc_wr_rx_post(link); - // credits have already been announced to peer - atomic_set(&link->local_rq_credits, 0); - return rc; -} - /***************************** init, exit, misc ******************************/ void smc_wr_remember_qp_attr(struct smc_link *lnk) @@ -850,21 +827,17 @@ void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { int i; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); - tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); - } + for (i = 0; i < smcibdev->num_cq; i++) + tasklet_kill(&smcibdev->smcibcq[i].tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { int i; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, - smc_wr_tx_tasklet_fn); - tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, - smc_wr_rx_tasklet_fn); + for (i = 0; i < smcibdev->num_cq; i++) { + tasklet_setup(&smcibdev->smcibcq[i].tasklet, + smc_wr_tasklet_fn); } } diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 5b671065afdc..ce338e1ca6c2 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -187,12 +187,11 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct 
smc_wr_tx_pend_priv *priv, int len); int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); -void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); -void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr); #endif /* SMC_WR_H */ -- Gitee From fe1fdc48a1db43a2da98cfea7e99eab1b87a96f6 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 25 May 2022 15:24:44 +0800 Subject: [PATCH 38/76] anolis: net/smc: remove redundant ib_req_notify_cq ANBZ: #1742 Solicited flag is only used by RCQ. As SCQ and RCQ are combined into one CQ, we can not notify cq with solicited flag. And immediately after the solicited notify cq, another notify with next complete flag is performed, the state machine of CQ will also immediately switch from the Arm_Sol state to the Armed state, which is the same as the result of direct notify with next complete flag. So the code of notify CQ with solicited is redundant and meaningless. 
Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 6d54861b2a3f..485041bfd0d4 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,11 +131,6 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, - IB_CQ_SOLICITED_MASK); - if (rc) - goto out; - rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) -- Gitee From 56561b5a4762fbb3a7daae1819c022bc1d15fde7 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 9 Jun 2022 09:48:43 +0800 Subject: [PATCH 39/76] anolis: net/smc: poll_cq one more time if the polled cqe is less than SMC_WR_MAX_POLL_CQE ANBZ: #1742 notify cq with IB_CQ_REPORT_MISSED_EVENTS flag, rdma driver will return positive value if the cq is not empty, and will always arm cq regardless of whether the cq is empty or not. Once arm cq when cq is not empty, cq interrupt will be generated even though cq has been drained out after arm. Thus, if new cqe is generated between cq drained out and arm cq, SMC-R will get positive value when ib_req_notify_cq and goto poll cq and drain cqe again, and cq interrupt is useless in such condition. In nginx + wrk benchmark, about 10% of the cq interrupts are useless cq interrupts. Poll cq one more time if the polled cqe is less than SMC_WR_MAX_POLL_CQE can reduce the useless cq interrupts from 10% to 1%.
Nginx benchmark shows 7.5% improvement in throughput: Server test command: smc_run nginx Client test command: smc_run /opt/wrk/wrk http://ip:port -t 32 -c 992 -d 30 --latency Before: Requests/sec: 1983511.11 Transfer/sec: 304.55MB After: Requests/sec: 2133148.49 Transfer/sec: 327.53MB Fixes: f49d6eda516f (net/smc: don't req_notify until all CQEs drained) Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index cb8bd0e04cb4..a246e3bb9a4c 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -484,11 +484,6 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) break; } } - if (rc < SMC_WR_MAX_POLL_CQE) - /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been - * drained, no need to poll again. - */ - break; } while (rc > 0); /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, -- Gitee From 766d103844884931d934f8bd3385c8f0a79c84e7 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 6 Jun 2022 10:52:43 +0800 Subject: [PATCH 40/76] anolis: net/smc: introduce 1RTT to SMC-R ANBZ: #1742 SMC-R 1rtt is currently an internal version. In order to be compatible with subsequent community versions, sysctl is used here, and it is turned on by default. We have noticed that single network interface card is mainstream on the cloud, dues to the advantages of cloud deployment costs and the cloud's own disaster recovery support. On the other hand, the emergence of RoCE LAG technology makes us no longer need to deal with multiple RDMA network interface cards by ourselves, just like NIC bonding does. In Alibaba, Roce LAG is widely used for RDMA. In that case, SMC-R have only one single link, if so, the RKEY LLC messages that to perform information exchange in all links are no longer needed, the SMC Proposal & accept has already complete the exchange of all information needed. 
So we think that we can remove the RKEY exchange in that case, which will save us 2-RTT over IB. We call it as SMC-R 2-RTT. We can use TCP fast open, carry the SMC proposal data by TCP SYN message, reduce the time that the SMC waits for the TCP connection to be established. This will save us another 1-RTT over IP. Signed-off-by: D. Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 3 +++ net/smc/af_smc.c | 35 +++++++++++++++++++++++++++++++---- net/smc/smc.h | 3 +++ net/smc/smc_sysctl.c | 21 +++++++++++++++++++++ 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index f8a6c4ebb985..3d37a316abde 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -32,5 +32,8 @@ struct netns_smc { int sysctl_tcp2smc; int sysctl_allow_different_subnet; int sysctl_disable_multiple_link; + /* allow simplify rkey exchange when single link */ + unsigned int sysctl_simplify_rkey_exhcange; + unsigned int sysctl_smc_fastopen; }; #endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e2ca4c0da7c5..caff3ef7134c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -69,6 +69,15 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); +static inline int smc_clcsock_enable_fastopen(struct smc_sock *smc, int is_server) +{ + int val = 1; + + return smc->clcsock->ops->setsockopt(smc->clcsock, SOL_TCP, + is_server ? 
TCP_FASTOPEN : TCP_FASTOPEN_CONNECT, + KERNEL_SOCKPTR(&val), sizeof(val)); +} + int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); @@ -418,6 +427,10 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); + /* default behavior from every net namespace */ + smc->simplify_rkey_exhcange = net->smc.sysctl_simplify_rkey_exhcange; + smc->smc_fastopen = net->smc.sysctl_smc_fastopen; + return sk; } @@ -539,9 +552,10 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, } /* register the new rmb on all links */ -static int smcr_lgr_reg_rmbs(struct smc_link *link, +static int smcr_lgr_reg_rmbs(struct smc_sock *smc, struct smc_buf_desc *rmb_desc) { + struct smc_link *link = smc->conn.lnk; struct smc_link_group *lgr = link->lgr; int i, lnk = 0, rc = 0; @@ -563,7 +577,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, } /* do not exchange confirm_rkey msg since there are only one link */ - if (lnk > 1) { + if (lnk > 1 || !smc->simplify_rkey_exhcange) { /* exchange confirm_rkey msg with peer */ rc = smc_llc_do_confirm_rkey(link, rmb_desc); if (rc) { @@ -1311,7 +1325,7 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } - if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { + if (smcr_lgr_reg_rmbs(smc, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } @@ -1576,6 +1590,11 @@ static void smc_connect_work(struct work_struct *work) if (!timeo) timeo = MAX_SCHEDULE_TIMEOUT; + + if (smc->smc_fastopen && + inet_sk(smc->clcsock->sk)->defer_connect) + goto defer_connect; + lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; @@ -1588,6 +1607,7 @@ static void smc_connect_work(struct work_struct *work) rc = 0; } release_sock(smc->clcsock->sk); +defer_connect: lock_sock(&smc->sk); if (rc != 0 || 
smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; @@ -1672,6 +1692,10 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, rc = -EALREADY; goto out; } + + if (smc->smc_fastopen && smc_clcsock_enable_fastopen(smc, /* is_server */ 0)) + smc->smc_fastopen = 0; /* rollback when setsockopt failed */ + rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; @@ -2223,7 +2247,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) conn->sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } - if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) + if (smcr_lgr_reg_rmbs(new_smc, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; } @@ -2601,6 +2625,9 @@ static int smc_listen(struct socket *sock, int backlog) if (smc->limit_smc_hs) tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + if (smc->smc_fastopen && smc_clcsock_enable_fastopen(smc, /* is server */ 1)) + smc->smc_fastopen = 0; /* rollback when setsockopt failed */ + rc = kernel_listen(smc->clcsock, backlog); if (rc) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); diff --git a/net/smc/smc.h b/net/smc/smc.h index 86947bec41d4..52141ce3f67b 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -260,6 +260,9 @@ struct smc_sock { /* smc sock container */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool limit_smc_hs; /* put constraint on handshake */ + bool simplify_rkey_exhcange; /* simplify rkey exchange */ + /* enable SMC-R handshake proposal via tcp fastopen */ + bool smc_fastopen; bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index fd5c3935313f..23abd9188c34 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -97,6 +97,24 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + 
.procname = "simplify_rkey_exhcange", + .data = &init_net.smc.sysctl_simplify_rkey_exhcange, + .maxlen = sizeof(init_net.smc.sysctl_simplify_rkey_exhcange), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "fastopen", + .data = &init_net.smc.sysctl_smc_fastopen, + .maxlen = sizeof(init_net.smc.sysctl_smc_fastopen), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -128,6 +146,9 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 1; net->smc.sysctl_disable_multiple_link = 1; + /* default on */ + net->smc.sysctl_simplify_rkey_exhcange = 1; + net->smc.sysctl_smc_fastopen = 1; return 0; err_reg: -- Gitee From 56bc6edd8bfa8432d6f29b9f03f283d5d956074b Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 27 Jun 2022 21:30:53 +0800 Subject: [PATCH 41/76] anolis: net/smc: clear ib_cq errno when ib_create_cq failed ANBZ: #1742 When ib_create_cq failed, ib_create_cq will return errno stored in ib_cq ptr, and then smc_ib_cleanup_cq will be called to destroy cq. The ib_cq ptr, where stores errno will pass to ib_destroy_cq, which will cause kernel crash. Clear errno in ib_cq ptr to NULL, when ib_create_cq failed. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 485041bfd0d4..f8234423d70a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -916,8 +916,11 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_wr_cq_handler, NULL, smcibcq, &cqattr); rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) + if (IS_ERR(smcibcq->ib_cq)) { + smcibcq->ib_cq = NULL; goto err; + } + } smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; -- Gitee From d679fe0347dff3d5b20844af3499d8e1f4bb3a6e Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 27 Jun 2022 21:31:02 +0800 Subject: [PATCH 42/76] anolis: net/smc: remove redundant ib_req_notify_cq ANBZ: #1742 ib_req_notify_cq after ib_cq created and every cq event processed. Other ib_req_notify_cq is redundant. This patch also improves connecting performance, as ib_req_notify_cq in connecting process and in smc_wr_tasklet_fn has lock competition. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_ib.c | 7 +++---- net/smc/smc_wr.c | 4 ---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index f8234423d70a..6c2bc2849058 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,10 +131,6 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc) - goto out; rc = smc_wr_rx_post_init(lnk); if (rc) @@ -921,6 +917,9 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) goto err; } + rc = ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP); + if (rc) + goto err; } smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index a246e3bb9a4c..1866d40baeb0 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -294,8 +294,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibcq->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); @@ -338,8 +336,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibcq->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; link->wr_reg.mr = mr; -- Gitee From 6462ab5b92f3f50395b410a835bc3479cc3e27bb Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Mon, 27 Jun 2022 21:31:10 +0800 Subject: [PATCH 43/76] anolis: net/smc: remove solicited flag for wr send ANBZ: #1742 In smc, CQ state machine will not turn to arm_sol, solicited flag is useless and harmful for rdma dim. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_wr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 1866d40baeb0..ab183684c954 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -557,8 +557,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; lnk->wr_tx_ibs[i].num_sge = 1; lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; - lnk->wr_tx_ibs[i].send_flags = - IB_SEND_SIGNALED | IB_SEND_SOLICITED; + lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED; if (send_inline) lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; @@ -578,8 +577,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; lnk->wr_tx_v2_ib->num_sge = 1; lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; - lnk->wr_tx_v2_ib->send_flags = - IB_SEND_SIGNALED | IB_SEND_SOLICITED; + lnk->wr_tx_v2_ib->send_flags = IB_SEND_SIGNALED; } /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. -- Gitee From e79edb8b4c8b8beabe4383db6f759e18d2871c8c Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 21 Jun 2022 11:18:55 +0800 Subject: [PATCH 44/76] anolis: net/smc: Fix potential leaks on queued_smc_hs ANBZ: #1742 The following potential scenarios could cause leaks: atomic_inc(&lsmc->queued_smc_hs); ... smc_listen_out_err __smc_lgr_terminate smc_conn_kill switch sk.sk_state case SMC_INIT: break; sock_set_flag(sk, SOCK_DEAD); smc_close_passive_work old_state = SMC_INIT if (sk_state == SMC_INIT) sk_state = SMC_APPCLOSEWAIT1; sk_state=SMC_CLOSED old_state != sk_state; sk_state == SMC_CLOSED; sock_flag(sk, SOCK_DEAD) smc_clcsock_release() clcsock = NULL if (clcsock ...) atomic_dec(&lsmc->queued_smc_hs) Signed-off-by: D. 
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 11 ++++++----- net/smc/smc.h | 5 +++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index caff3ef7134c..62dd2b2f4417 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1920,11 +1920,8 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - mutex_lock(&new_smc->clcsock_release_lock); - if (new_smc->clcsock && new_smc->clcsock->sk && - tcp_sk(new_smc->clcsock->sk)->syn_smc) + if (new_smc->smc_negotiated) atomic_dec(&lsmc->queued_smc_hs); - mutex_unlock(&new_smc->clcsock_release_lock); if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); @@ -2539,8 +2536,12 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) { + new_smc->smc_negotiated = 1; atomic_inc(&lsmc->queued_smc_hs); + /* memory barrier */ + smp_mb__after_atomic(); + } new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; diff --git a/net/smc/smc.h b/net/smc/smc.h index 52141ce3f67b..ffbe3ecc2fb4 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -279,6 +279,11 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ + u8 smc_negotiated : 1; + /* whether the smc_sock + * was successfully negotiated + * via TCP options. + */ u8 connect_nonblock : 1; /* non-blocking connect in * flight -- Gitee From 611451163b5d6f81265e56e7233f7e3e24304797 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 29 Jun 2022 20:38:08 +0800 Subject: [PATCH 45/76] anolis: net/smc: Use diff TCP EXPR MAGIC to avoid network middleware do simple echo ANBZ: #1742 We found some network middleware will echo unknown TCP options, which will confuse SMC client. Signed-off-by: D.
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- include/net/netns/smc.h | 2 ++ include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 21 ++++++++++++++++----- net/ipv4/tcp_output.c | 14 +++++++++++++- net/smc/smc_sysctl.c | 11 +++++++++++ 5 files changed, 44 insertions(+), 6 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 3d37a316abde..8124f59f856e 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -35,5 +35,7 @@ struct netns_smc { /* allow simplify rkey exchange when single link */ unsigned int sysctl_simplify_rkey_exhcange; unsigned int sysctl_smc_fastopen; + /* use diff TCP experiment magic code */ + unsigned int sysctl_smc_experiments; }; #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 1e32e3fb58e1..011038a36a4b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -195,6 +195,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 +/* "SMCO" in EBCDIC encoding */ +#define TCPOPT_SMC_OK_MAGIC 0xE2D4C3D6 /* * TCP option lengths diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6a8d53d6540b..da8bd811dc66 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3912,15 +3912,26 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, + const struct net *net, int opsize) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && - opsize >= TCPOLEN_EXP_SMC_BASE && - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { - opt_rx->smc_ok = 1; - return true; + opsize >= TCPOLEN_EXP_SMC_BASE) { + /* syn ack */ + if (th->ack && net->smc.sysctl_smc_experiments) { + if (get_unaligned_be32(ptr) == TCPOPT_SMC_OK_MAGIC) { + opt_rx->smc_ok = 1; + return true; + } + return false; + } + /* syn only 
*/ + if (get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { + opt_rx->smc_ok = 1; + return true; + } } } #endif @@ -4083,7 +4094,7 @@ void tcp_parse_options(const struct net *net, break; } - if (smc_parse_options(th, opt_rx, ptr, opsize)) + if (smc_parse_options(th, opt_rx, ptr, net, opsize)) break; opt_rx->saw_unknown = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 97056f4c0bfd..77776ec6ad4b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -417,6 +417,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) #define OPTION_MPTCP (1 << 10) +#define OPTION_SMC_OK BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -428,6 +429,12 @@ static void smc_options_write(__be32 *ptr, u16 *options) (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } else if (OPTION_SMC_OK & *options) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_OK_MAGIC); } } #endif @@ -727,10 +734,15 @@ static void smc_set_option_cond(const struct tcp_sock *tp, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) + const struct sock *sk; + + sk = &tp->inet_conn.icsk_inet.sk; + if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { - opts->options |= OPTION_SMC; + opts->options |= sock_net(sk)->smc.sysctl_smc_experiments ? 
+ OPTION_SMC_OK : OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 23abd9188c34..aaba83d0c940 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -115,6 +115,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "sysctl_smc_experiments", + .data = &init_net.smc.sysctl_smc_experiments, + .maxlen = sizeof(init_net.smc.sysctl_smc_experiments), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -149,6 +158,8 @@ int __net_init smc_sysctl_net_init(struct net *net) /* default on */ net->smc.sysctl_simplify_rkey_exhcange = 1; net->smc.sysctl_smc_fastopen = 1; + /* default off */ + net->smc.sysctl_smc_experiments = 0; return 0; err_reg: -- Gitee From 7578fb4cf8c713fd54cefdc0abdd67c36943886a Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:04:24 +0800 Subject: [PATCH 46/76] anolis: net/smc: Change listen wq to unbound highpri wq ANBZ: #1742 Change listen wq to unbound and highpri wq. Signed-off-by: D. Wythe Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 62dd2b2f4417..57d634535311 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3488,7 +3488,7 @@ static int __init smc_init(void) rc = -ENOMEM; - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!smc_tcp_ls_wq) goto out_pnet; -- Gitee From 3837a83ab15d717faa85ce110c030aa58462d597 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:04:45 +0800 Subject: [PATCH 47/76] anolis: net/smc: remove useless path ANBZ: #1742 Shorten the fallback processing path Signed-off-by: D. 
Wythe Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 57d634535311..02e3bc3834a7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2421,16 +2421,6 @@ static void smc_listen_work(struct work_struct *work) return; } - /* check if peer is smc capable */ - if (!tcp_sk(newclcsock->sk)->syn_smc) { - rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); - if (rc) - smc_listen_out_err(new_smc); - else - smc_listen_out_connected(new_smc); - return; - } - /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ @@ -2536,13 +2526,6 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) { - new_smc->smc_negotiated = 1; - atomic_inc(&lsmc->queued_smc_hs); - /* memory barrier */ - smp_mb__after_atomic(); - } - new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -2551,9 +2534,26 @@ static void smc_tcp_listen_work(struct work_struct *work) smc_copy_sock_settings_to_smc(new_smc); new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; - sock_hold(&new_smc->sk); /* sock_put in passive closing */ - if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) - sock_put(&new_smc->sk); + + /* check if peer is smc capable */ + if (!tcp_sk(new_smc->clcsock->sk)->syn_smc) { + release_sock(lsk); + sock_hold(&new_smc->sk); /* sock_put in passive closing */ + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); + lock_sock(lsk); + } else { + new_smc->smc_negotiated = 1; + atomic_inc(&lsmc->queued_smc_hs); + /* memory barrier */ + smp_mb__after_atomic(); + 
sock_hold(&new_smc->sk); /* sock_put in passive closing */ + if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) + sock_put(&new_smc->sk); + } } out: -- Gitee From af79f7d9d15f2b6b484a048fe5ec312e2b7530ca Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 21 Jul 2022 11:05:04 +0800 Subject: [PATCH 48/76] anolis: net/smc: queue free_work to smc_close_wq instead of smc_hs_wq ANBZ: #1742 Queue free_work to smc_close_wq instead of smc_hs_wq. Signed-off-by: D. Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 02e3bc3834a7..6138b75e7503 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -309,7 +309,7 @@ static int __smc_release(struct smc_sock *smc) if (sk->sk_state == SMC_CLOSED) { sock_hold(sk); - if (!queue_work(smc_hs_wq, &smc->free_work)) + if (!queue_work(smc_close_wq, &smc->free_work)) sock_put(sk); } -- Gitee From d0872399a5b6b79323aa688c0fa25ae0d19354b4 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:06:46 +0800 Subject: [PATCH 49/76] anolis: net/smc: do not use free work if fallback ANBZ: #1742 Do not use free work if fallback to shorten fallback smc_release process. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6138b75e7503..e59651c4ca3d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -308,9 +308,17 @@ static int __smc_release(struct smc_sock *smc) sk->sk_prot->unhash(sk); if (sk->sk_state == SMC_CLOSED) { - sock_hold(sk); - if (!queue_work(smc_close_wq, &smc->free_work)) - sock_put(sk); + if (smc->clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } + + if (!smc->use_fallback) { + sock_hold(sk); + if (!queue_work(smc_close_wq, &smc->free_work)) + sock_put(sk); + } } return rc; @@ -381,16 +389,8 @@ static void smc_free_work(struct work_struct *work) sk = &smc->sk; lock_sock(sk); - if (sk->sk_state == SMC_CLOSED) { - if (smc->clcsock) { - release_sock(sk); - smc_clcsock_release(smc); - lock_sock(sk); - } - - if (!smc->use_fallback) - smc_conn_free(&smc->conn); - } + if (sk->sk_state == SMC_CLOSED && !smc->use_fallback) + smc_conn_free(&smc->conn); release_sock(sk); sock_put(sk); /* before queue */ -- Gitee From 0e3ab266e027b5365ac2855f9caa49be74d4e6d6 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:07:31 +0800 Subject: [PATCH 50/76] anolis: net/smc: only add wait queue when smc_accept_dequeue get null ANBZ: #1742 Only add wait queue when smc_accept_dequeue get null. And change wait queue api from add_wait_queue_exclusive to prepare_to_wait_exclusive as inet_csk_wait_for_connect did. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e59651c4ca3d..567994f16360 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2651,9 +2651,10 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) { struct sock *sk = sock->sk, *nsk; - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT(wait); struct smc_sock *lsmc; long timeo; + bool waited = false; int rc = 0; lsmc = smc_sk(sk); @@ -2666,15 +2667,16 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, goto out; } - /* Wait for an incoming connection */ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { - set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { rc = -EAGAIN; break; } + /* Wait for an incoming connection */ + prepare_to_wait_exclusive(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + waited = true; release_sock(sk); timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ @@ -2685,8 +2687,9 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, break; } } - set_current_state(TASK_RUNNING); - remove_wait_queue(sk_sleep(sk), &wait); + + if (waited) + finish_wait(sk_sleep(sk), &wait); if (!rc) rc = sock_error(nsk); -- Gitee From 642943df478f6f5aebf441a40deb8b3a3ba304d3 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:07:39 +0800 Subject: [PATCH 51/76] anolis: net/smc: double check whether accept queue is empty before schedule_timeout ANBZ: #1742 Double check whether accept queue is empty before schedule_timeout. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 567994f16360..fcbfbb0c6c2c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1812,6 +1812,11 @@ static void smc_accept_unlink(struct sock *sk) sock_put(sk); /* sock_hold in smc_accept_enqueue */ } +static inline bool smc_accept_queue_empty(struct sock *sk) +{ + return list_empty(&smc_sk(sk)->accept_q); +} + /* remove a sock from the accept queue to bind it to a new socket created * for a socket accept call from user space */ @@ -2678,7 +2683,8 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, TASK_INTERRUPTIBLE); waited = true; release_sock(sk); - timeo = schedule_timeout(timeo); + if (smc_accept_queue_empty(sk)) + timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ sched_annotate_sleep(); lock_sock(sk); -- Gitee From 5bb5016ad93caf37c2044e7a004f0035859b0763 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:01 +0800 Subject: [PATCH 52/76] anolis: net/smc: optimize for smc_accept_poll ANBZ: #1742 It is no need to lock accept_q_lock when checking accept_q is empty or not. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fcbfbb0c6c2c..d6a6622a7c23 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2812,17 +2812,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, return rc; } -static __poll_t smc_accept_poll(struct sock *parent) +static inline __poll_t smc_accept_poll(struct sock *parent) { - struct smc_sock *isk = smc_sk(parent); - __poll_t mask = 0; - - spin_lock(&isk->accept_q_lock); - if (!list_empty(&isk->accept_q)) - mask = EPOLLIN | EPOLLRDNORM; - spin_unlock(&isk->accept_q_lock); + if (!smc_accept_queue_empty(parent)) + return EPOLLIN | EPOLLRDNORM; - return mask; + return 0; } static __poll_t smc_poll(struct file *file, struct socket *sock, -- Gitee From ef69aaf4b541df121c4b242740eaa8330a9d4c94 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 13:02:22 +0800 Subject: [PATCH 53/76] anolis: net/smc: remove sock lock in smc_tcp_listen_work ANBZ: #1742 Since lsk is held and will not be freed, and lsk is used to read some fields in it, it is no need to lock lsk in smc_tcp_listen_work. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d6a6622a7c23..ed2859a5c532 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1733,13 +1733,11 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc = -EINVAL; - release_sock(lsk); new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; *new_smc = NULL; - lock_sock(lsk); goto out; } *new_smc = smc_sk(new_sk); @@ -1748,7 +1746,6 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (lsmc->clcsock) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); - lock_sock(lsk); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { @@ -2523,7 +2520,6 @@ static void smc_tcp_listen_work(struct work_struct *work) struct smc_sock *new_smc; int rc = 0; - lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); if (rc) /* clcsock accept queue empty or error */ @@ -2542,14 +2538,12 @@ static void smc_tcp_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(new_smc->clcsock->sk)->syn_smc) { - release_sock(lsk); sock_hold(&new_smc->sk); /* sock_put in passive closing */ rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); if (rc) smc_listen_out_err(new_smc); else smc_listen_out_connected(new_smc); - lock_sock(lsk); } else { new_smc->smc_negotiated = 1; atomic_inc(&lsmc->queued_smc_hs); @@ -2562,7 +2556,6 @@ static void smc_tcp_listen_work(struct work_struct *work) } out: - release_sock(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } -- Gitee From c4193461685310fb6b53e1cf5ae6adb6c09be85a Mon Sep 17 00:00:00 2001 From: 
Guangguan Wang Date: Thu, 21 Jul 2022 13:02:34 +0800 Subject: [PATCH 54/76] anolis: net/smc: move sk_acceptq_{removed,add} into accept_q_lock's protection ANBZ: #1742 Move sk_acceptq_{removed,add} into accept_q_lock's protection. Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ed2859a5c532..94e7f8ef7cc4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1793,8 +1793,8 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); - spin_unlock(&par->accept_q_lock); sk_acceptq_added(parent); + spin_unlock(&par->accept_q_lock); } /* remove a socket from the accept queue of its parental listening socket */ @@ -1804,8 +1804,8 @@ static void smc_accept_unlink(struct sock *sk) spin_lock(&par->accept_q_lock); list_del_init(&smc_sk(sk)->accept_q); - spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); + spin_unlock(&par->accept_q_lock); sock_put(sk); /* sock_hold in smc_accept_enqueue */ } -- Gitee From f7bab48fbbf68835e129eefb14ad3aac24c60d7a Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 12:09:16 +0800 Subject: [PATCH 55/76] anolis: net/smc: smc_sock_alloc after kernel_accept ANBZ: #1742 Execution of smc_sock_alloc and free sock is a waste of CPU when kernel_accept fails. As the success probability of smc sock_alloc is higher than that of kernel_accept, it is more reasonable to first kernel_accept and then smc_sock_alloc. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 94e7f8ef7cc4..0793895d6f9f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1733,33 +1733,22 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc = -EINVAL; - new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); - if (!new_sk) { - rc = -ENOMEM; - lsk->sk_err = ENOMEM; - *new_smc = NULL; - goto out; - } - *new_smc = smc_sk(new_sk); - mutex_lock(&lsmc->clcsock_release_lock); if (lsmc->clcsock) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; - if (rc < 0 || lsk->sk_state == SMC_CLOSED) { - new_sk->sk_prot->unhash(new_sk); - mutex_lock(&lsmc->clcsock_release_lock); - if (new_clcsock) - sock_release(new_clcsock); - new_sk->sk_state = SMC_CLOSED; - sock_set_flag(new_sk, SOCK_DEAD); - mutex_unlock(&lsmc->clcsock_release_lock); - sock_put(new_sk); /* final */ - *new_smc = NULL; - goto out; + if (rc < 0 || lsk->sk_state == SMC_CLOSED) + goto err_out; + + new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); + if (!new_sk) { + rc = -ENOMEM; + lsk->sk_err = ENOMEM; + goto err_out; } + *new_smc = smc_sk(new_sk); /* new clcsock has inherited the smc listen-specific sk_data_ready * function; switch it back to the original sk_data_ready function @@ -1779,7 +1768,12 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) } (*new_smc)->clcsock = new_clcsock; -out: + + return 0; +err_out: + *new_smc = NULL; + if (new_clcsock) + sock_release(new_clcsock); return rc; } -- Gitee From 84b8d0676a81cf8ed33b0009ed1a9fab0ac97000 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:25 
+0800 Subject: [PATCH 56/76] anolis: net/smc: Introduce multiple tcp listen works to enhance tcp_listen_work ANBZ: #1742 Introduce multiple tcp listen works to enhance tcp_listen_work, as each tcp listen work can be enqueued independently to workqueue and can be executed concurrently. Since kernel_accept cannot accept concurrently, too many tcp listen works will only lead to excessive kernel_accept competition and waste CPU, the number of the tcp listen works is now set to 2, which has been tested to be the best performance. Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 16 ++++++++++++---- net/smc/smc.h | 11 ++++++++++- net/smc/smc_close.c | 4 +++- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0793895d6f9f..5031555287ab 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -402,6 +402,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, struct smc_sock *smc; struct proto *prot; struct sock *sk; + int i = 0; prot = (protocol == SMCPROTO_SMC6) ? 
&smc_proto6 : &smc_proto; sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); @@ -415,7 +416,11 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); - INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { + smc->tcp_listen_works[i].smc = smc; + INIT_WORK(&smc->tcp_listen_works[i].work, smc_tcp_listen_work); + } + atomic_set(&smc->tcp_listen_work_seq, 0); INIT_WORK(&smc->free_work, smc_free_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); @@ -2508,8 +2513,9 @@ static void smc_listen_work(struct work_struct *work) static void smc_tcp_listen_work(struct work_struct *work) { - struct smc_sock *lsmc = container_of(work, struct smc_sock, - tcp_listen_work); + struct smc_tcp_listen_work *twork = + container_of(work, struct smc_tcp_listen_work, work); + struct smc_sock *lsmc = twork->smc; struct sock *lsk = &lsmc->sk; struct smc_sock *new_smc; int rc = 0; @@ -2563,8 +2569,10 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { + int idx = atomic_fetch_inc(&lsmc->tcp_listen_work_seq) % + SMC_MAX_TCP_LISTEN_WORKS; sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_works[idx].work)) sock_put(&lsmc->sk); } out: diff --git a/net/smc/smc.h b/net/smc/smc.h index ffbe3ecc2fb4..7c20cded9be4 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -240,6 +240,13 @@ struct smc_connection { u8 out_of_sync : 1; /* out of sync with peer */ }; +#define SMC_MAX_TCP_LISTEN_WORKS 2 + +struct smc_tcp_listen_work { + struct smc_sock *smc; + struct work_struct work; +}; + struct smc_sock { /* smc sock container */ 
struct sock sk; struct socket *clcsock; /* internal tcp socket */ @@ -254,7 +261,9 @@ struct smc_sock { /* smc sock container */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ - struct work_struct tcp_listen_work;/* handle tcp socket accepts */ + struct smc_tcp_listen_work tcp_listen_works[SMC_MAX_TCP_LISTEN_WORKS]; + /* handle tcp socket accepts */ + atomic_t tcp_listen_work_seq;/* used to select tcp_listen_works */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct work_struct free_work; /* free smc conn */ struct list_head accept_q; /* sockets to be accepted */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 31db7438857c..857e30dfe154 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -199,6 +199,7 @@ int smc_close_active(struct smc_sock *smc) long timeout; int rc = 0; int rc1 = 0; + int i = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? @@ -223,7 +224,8 @@ int smc_close_active(struct smc_sock *smc) } smc_close_cleanup_listen(sk); release_sock(sk); - flush_work(&smc->tcp_listen_work); + for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) + flush_work(&smc->tcp_listen_works[i].work); lock_sock(sk); break; case SMC_ACTIVE: -- Gitee From 3f5745e8c16840ab465a6d174a251bf842d95627 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:35 +0800 Subject: [PATCH 57/76] anolis: net/smc: only cancel connect_work when connect nonblock ANBZ: #1742 In smc_release, only cancel connect_work when connect nonblock, as connect whithout NON_BLOCKING flag will not queue connect work. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5031555287ab..9b53b3c8bb1a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -342,7 +342,7 @@ static int smc_release(struct socket *sock) if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); - if (cancel_work_sync(&smc->connect_work)) + if (smc->connect_nonblock && cancel_work_sync(&smc->connect_work)) sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) -- Gitee From a5eadecb4caab43ad2f62c88b18650929e11858b Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:42 +0800 Subject: [PATCH 58/76] anolis: net/smc: do not call cancel smc_listen_work when smc use fallback ANBZ: #1742 In smc_clcsock_release, do not call cancel smc_listen_work when smc use fallback, as fallback smc will not queue smc_listen_work to workqueue. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/smc_close.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 857e30dfe154..a10d762ee5d7 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -25,7 +25,8 @@ void smc_clcsock_release(struct smc_sock *smc) { struct socket *tcp; - if (smc->listen_smc && current_work() != &smc->smc_listen_work) + if (smc->listen_smc && !smc->use_fallback && + current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); if (smc->clcsock) { -- Gitee From 23ba9bdf8b9b81e471b523542715250831e940a7 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:49 +0800 Subject: [PATCH 59/76] anolis: net/smc: check sk_ack_backlog before kernel_accept ANBZ: #1742 If sock accept queue is empty, kernel_accept will sock_create_lite and quickly then sock_release, which is a waste of cpu. As sk_ack_backlog can indicate the accept queue's length, check the depth of the accept queue through sk_ack_backlog to decide whether to call kernel_accept, which can avoid the aforementioned waste of CPU. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9b53b3c8bb1a..afbcbebe2131 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1739,8 +1739,12 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) int rc = -EINVAL; mutex_lock(&lsmc->clcsock_release_lock); - if (lsmc->clcsock) - rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); + if (lsmc->clcsock) { + if (lsmc->clcsock->sk->sk_ack_backlog) + rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); + else + rc = -EAGAIN; + } mutex_unlock(&lsmc->clcsock_release_lock); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; -- Gitee From 3ead84b3187834abf34722144546d2d0d0d6dd4b Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 21 Jul 2022 11:08:56 +0800 Subject: [PATCH 60/76] anolis: net/smc: change clcsock_release_lock from mutex to rw_semaphore ANBZ: #1742 The lock names clcsock_release_lock is used to protect smc->clcsock from being released when in use. Since smc->clcsock is only released when smc_release and most of the access of smc->clcsock, replacing the lock to rw_semaphore is better than mutex. 
Signed-off-by: Guangguan Wang Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 --- net/smc/af_smc.c | 26 ++++++++++++++------------ net/smc/smc.h | 2 +- net/smc/smc_clc.c | 6 +++--- net/smc/smc_close.c | 4 ++-- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index afbcbebe2131..09657466dbd5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -429,7 +429,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); - mutex_init(&smc->clcsock_release_lock); + init_rwsem(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); /* default behavior from every net namespace */ @@ -923,7 +923,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { int rc = 0; - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { rc = -EBADF; goto out; @@ -946,7 +946,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc_fback_replace_callbacks(smc); } out: - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return rc; } @@ -1738,14 +1738,14 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc = -EINVAL; - mutex_lock(&lsmc->clcsock_release_lock); + down_read(&lsmc->clcsock_release_lock); if (lsmc->clcsock) { if (lsmc->clcsock->sk->sk_ack_backlog) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); else rc = -EAGAIN; } - mutex_unlock(&lsmc->clcsock_release_lock); + up_read(&lsmc->clcsock_release_lock); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) @@ -1832,10 +1832,12 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); + down_write(&isk->clcsock_release_lock); if 
(isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } + up_write(&isk->clcsock_release_lock); sock_put(new_sk); /* final */ continue; } @@ -3029,9 +3031,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EBADF; } if (unlikely(!smc->clcsock->ops->setsockopt)) @@ -3043,7 +3045,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -3109,19 +3111,19 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, return __smc_getsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sock->sk); - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EBADF; } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EOPNOTSUPP; } rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return rc; } diff --git a/net/smc/smc.h b/net/smc/smc.h index 7c20cded9be4..58eeeb45e237 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -297,7 +297,7 @@ struct smc_sock { /* smc sock container */ /* non-blocking connect in * flight */ - struct mutex clcsock_release_lock; + struct rw_semaphore clcsock_release_lock; /* protects clcsock of a listen * socket * */ diff --git a/net/smc/smc_clc.c 
b/net/smc/smc_clc.c index 9a75119b3437..365831c683f1 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -795,13 +795,13 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = send_len; - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock || !smc->clcsock->sk) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EPROTO; } len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 0 : len; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index a10d762ee5d7..2fe8cc94ba40 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -28,13 +28,13 @@ void smc_clcsock_release(struct smc_sock *smc) if (smc->listen_smc && !smc->use_fallback && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); - mutex_lock(&smc->clcsock_release_lock); + down_write(&smc->clcsock_release_lock); if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; sock_release(tcp); } - mutex_unlock(&smc->clcsock_release_lock); + up_write(&smc->clcsock_release_lock); } static void smc_close_cleanup_listen(struct sock *parent) -- Gitee From 5df090973d6c418e120a240c5034a13a2fd959eb Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Tue, 2 Aug 2022 16:22:37 +0800 Subject: [PATCH 61/76] anolis: net/smc: Introduce smc dim ANBZ: #1879 Dynamic interrupt moderation can coalesce interrupts and reduce cpu utilization. 
Nginx benchmark shows 13.7% improvement in throughput: Server test command: smc_run nginx Client test command: smc_run /opt/wrk/wrk http://ip:port -t 32 -c 992 -d 30 --latency Before: Requests/sec: 2467285.48 After: Requests/sec: 2804945.57 Signed-off-by: Guangguan Wang Link: https://gitee.com/anolis/cloud-kernel/pulls/623 Reviewed-by: D. Wythe Acked-by: Tony Lu --- net/smc/Makefile | 2 +- net/smc/smc_dim.c | 248 ++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_dim.h | 34 +++++++ net/smc/smc_ib.c | 6 +- net/smc/smc_wr.c | 9 +- 5 files changed, 296 insertions(+), 3 deletions(-) create mode 100644 net/smc/smc_dim.c create mode 100644 net/smc/smc_dim.h diff --git a/net/smc/Makefile b/net/smc/Makefile index bd6f807ff803..f9935659a436 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_proc.o smc_conv.o +smc-y += smc_tracepoint.o smc_proc.o smc_conv.o smc_dim.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/smc_dim.c b/net/smc/smc_dim.c new file mode 100644 index 000000000000..280696beb54a --- /dev/null +++ b/net/smc/smc_dim.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022, Alibaba Group. + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include +#include "smc_dim.h" + +#define SMC_IS_SIGNIFICANT_DIFF(val, ref, threshold) \ + ((ref) && (((100UL * abs((val) - (ref))) / (ref)) >= (threshold))) + +#define SMC_CPMS_THRESHOLD 5 +#define SMC_CPERATIO_THRESHOLD 25 +#define SMC_MAX_FLUCTUATIONS 3 +#define CPU_IDLE_UTIL_THRESHOLD 5 +#define CPU_SOFTIRQ_UTIL_THRESHOLD 10 + +#define SMC_DIM_PARAMS_NUM_PROFILES 4 +#define SMC_DIM_START_PROFILE 0 + +static const struct dim_cq_moder +smc_dim_profile[SMC_DIM_PARAMS_NUM_PROFILES] = { + {1, 0, 2, 0}, + {4, 0, 8, 0}, + {16, 0, 16, 0}, + {32, 0, 32, 0}, +}; + +static void smc_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + struct ib_cq *cq = dim->priv; + + u16 usec = smc_dim_profile[dim->profile_ix].usec; + u16 comps = smc_dim_profile[dim->profile_ix].comps; + + dim->state = DIM_START_MEASURE; + cq->device->ops.modify_cq(cq, comps, usec); +} + +void smc_dim_init(struct ib_cq *cq) +{ + struct smc_dim *smc_dim; + struct dim *dim; + + if (!cq->device->ops.modify_cq) + return; + + smc_dim = kzalloc(sizeof(*smc_dim), GFP_KERNEL); + if (!smc_dim) + return; + + smc_dim->use_dim = cq->device->use_cq_dim; + dim = to_dim(smc_dim); + dim->state = DIM_START_MEASURE; + dim->tune_state = DIM_GOING_RIGHT; + dim->profile_ix = SMC_DIM_START_PROFILE; + dim->priv = cq; + cq->dim = dim; + INIT_WORK(&dim->work, smc_dim_work); +} + +void smc_dim_destroy(struct ib_cq *cq) +{ + if (!cq->dim) + return; + + cancel_work_sync(&cq->dim->work); + kfree(cq->dim); +} + +static inline void smc_dim_param_clear(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tired = 0; + dim->profile_ix = SMC_DIM_START_PROFILE; + dim->tune_state = DIM_GOING_RIGHT; +} + +static inline void smc_dim_reset(struct dim *dim) +{ + int prev_ix = dim->profile_ix; + + smc_dim_param_clear(dim); + if (prev_ix != dim->profile_ix) + schedule_work(&dim->work); + else + dim->state = DIM_START_MEASURE; +} + +static int smc_dim_step(struct dim *dim) +{ + if 
(dim->tune_state == DIM_GOING_RIGHT) { + if (dim->profile_ix == (SMC_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + } + if (dim->tune_state == DIM_GOING_LEFT) { + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + } + + return DIM_STEPPED; +} + +static int smc_dim_stats_compare(struct dim_stats *curr, struct dim_stats *prev) +{ + /* first stat */ + if (!prev->cpms) + return DIM_STATS_BETTER; + + if (SMC_IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms, SMC_CPMS_THRESHOLD)) + return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (SMC_IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio, SMC_CPERATIO_THRESHOLD)) + return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static void smc_dim_exit_parking(struct dim *dim) +{ + dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT; + smc_dim_step(dim); + dim->tired = 0; +} + +static bool smc_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_state = dim->tune_state; + int prev_ix = dim->profile_ix; + int stats_res = smc_dim_stats_compare(curr_stats, + &dim->prev_stats); + + if (curr_stats->cpms < 50) { + smc_dim_param_clear(dim); + goto out; + } + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + if (stats_res != DIM_STATS_SAME) { + if (dim->tired++ > SMC_MAX_FLUCTUATIONS) + smc_dim_exit_parking(dim); + } else { + dim->tired = 0; + } + break; + case DIM_GOING_RIGHT: + case DIM_GOING_LEFT: + if (stats_res != DIM_STATS_BETTER) { + dim_turn(dim); + } else if (dim_on_top(dim)) { + dim_park_on_top(dim); + break; + } + + if (smc_dim_step(dim) == DIM_ON_EDGE) + dim_park_on_top(dim); + break; + } + +out: + if (prev_state != DIM_PARKING_ON_TOP || + dim->tune_state != DIM_PARKING_ON_TOP) + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +static bool smc_dim_check_utilization(struct dim *dim) 
+{ + struct smc_dim *smc_dim = to_smcdim(dim); + int cpu = smp_processor_id(); + struct kernel_cpustat kcpustat; + u32 idle_percent, softirq_percent; + u64 wall, wall_idle, diff_wall, softirq; + + wall_idle = get_cpu_idle_time(cpu, &wall, 1); + kcpustat_cpu_fetch(&kcpustat, cpu); + + softirq = div_u64(kcpustat_field(&kcpustat, CPUTIME_SOFTIRQ, cpu), NSEC_PER_USEC); + diff_wall = wall - smc_dim->prev_wall; + idle_percent = div64_u64(100 * (wall_idle - smc_dim->prev_idle), diff_wall); + softirq_percent = div64_u64(100 * (softirq - smc_dim->prev_softirq), diff_wall); + + smc_dim->prev_softirq = softirq; + smc_dim->prev_idle = wall_idle; + smc_dim->prev_wall = wall; + + return idle_percent < CPU_IDLE_UTIL_THRESHOLD && + softirq_percent >= CPU_SOFTIRQ_UTIL_THRESHOLD; +} + +void smc_dim(struct dim *dim, u64 completions) +{ + struct ib_cq *cq = dim->priv; + struct smc_dim *smc_dim = to_smcdim(dim); + struct dim_sample *curr_sample = &dim->measuring_sample; + struct dim_stats curr_stats; + u32 nevents; + + if (unlikely(smc_dim->use_dim != cq->device->use_cq_dim)) { + smc_dim->use_dim = cq->device->use_cq_dim; + if (!smc_dim->use_dim) + smc_dim_reset(dim); + } + + if (!smc_dim->use_dim) + return; + + dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0, + curr_sample->comp_ctr + completions, + &dim->measuring_sample); + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = curr_sample->event_ctr - dim->start_sample.event_ctr; + if (nevents < DIM_NEVENTS) + break; + if (!smc_dim_check_utilization(dim)) { + smc_dim_reset(dim); + break; + } + dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats); + if (smc_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + fallthrough; + case DIM_START_MEASURE: + dim->state = DIM_MEASURE_IN_PROGRESS; + dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0, + curr_sample->comp_ctr, + &dim->start_sample); + break; + case DIM_APPLY_NEW_PROFILE: + 
break; + } +} diff --git a/net/smc/smc_dim.h b/net/smc/smc_dim.h new file mode 100644 index 000000000000..bc8175f7b708 --- /dev/null +++ b/net/smc/smc_dim.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022, Alibaba Group. + */ + +#ifndef _SMC_DIM_H +#define _SMC_DIM_H + +#include +#include + +struct smc_dim { + struct dim dim; + bool use_dim; + u64 prev_idle; + u64 prev_softirq; + u64 prev_wall; +}; + +static inline struct smc_dim *to_smcdim(struct dim *dim) +{ + return (struct smc_dim *)dim; +} + +static inline struct dim *to_dim(struct smc_dim *smcdim) +{ + return (struct dim *)smcdim; +} + +void smc_dim_init(struct ib_cq *cq); +void smc_dim_destroy(struct ib_cq *cq); +void smc_dim(struct dim *dim, u64 completions); + +#endif /* _SMC_DIM_H */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 6c2bc2849058..8d0454737c69 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -27,6 +27,7 @@ #include "smc_wr.h" #include "smc.h" #include "smc_netlink.h" +#include "smc_dim.h" #define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ @@ -870,8 +871,10 @@ static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) int i; for (i = 0; i < smcibdev->num_cq; i++) { - if (smcibdev->smcibcq[i].ib_cq) + if (smcibdev->smcibcq[i].ib_cq) { + smc_dim_destroy(smcibdev->smcibcq[i].ib_cq); ib_destroy_cq(smcibdev->smcibcq[i].ib_cq); + } } kfree(smcibdev->smcibcq); @@ -917,6 +920,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) goto err; } + smc_dim_init(smcibcq->ib_cq); rc = ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP); if (rc) goto err; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ab183684c954..2d400a0a13f4 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -30,6 +30,7 @@ #include "smc.h" #include "smc_wr.h" +#include "smc_dim.h" #define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. 
queue elements in 1 poll */ @@ -459,7 +460,7 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i, rc; + int i, rc, completed = 0; again: do { @@ -480,6 +481,9 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) break; } } + + if (rc > 0) + completed += rc; } while (rc > 0); /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, @@ -490,6 +494,9 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; + + if (smcibcq->ib_cq->dim) + smc_dim(smcibcq->ib_cq->dim, completed); } void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context) -- Gitee From 9d5105f892126f652f215a6acbddac02c47e48a5 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Fri, 22 Jul 2022 14:05:14 +0800 Subject: [PATCH 62/76] anolis: net/smc: skip smc_llc_flow_initiate while SMC-1RTT ANBZ: #1868 Under SMC-R 1rtt, confirm_rkey phase is skipped, hence it is not necessary to start a llc flow by smc_llc_flow_initiate. Signed-off-by: D. 
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/618 --- net/smc/af_smc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 09657466dbd5..47f568fc6fb0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -564,9 +564,11 @@ static int smcr_lgr_reg_rmbs(struct smc_sock *smc, struct smc_link_group *lgr = link->lgr; int i, lnk = 0, rc = 0; - rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); - if (rc) - return rc; + if (!smc->simplify_rkey_exhcange) { + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + } /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ @@ -594,7 +596,8 @@ static int smcr_lgr_reg_rmbs(struct smc_sock *smc, rmb_desc->is_conf_rkey = true; out: mutex_unlock(&lgr->llc_conf_mutex); - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + if (!smc->simplify_rkey_exhcange) + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } -- Gitee From 996bbfa6579171f3c339a1599a73df4fce98ab2e Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 2 Aug 2022 17:18:53 +0800 Subject: [PATCH 63/76] anolis: net/smc: Fix NULL pointer in smc_getname while first contact clcsock release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #1868 when first contact connection fails, there will be: smc_listen_work: (first contact) smc_listen_decline /* recv peer decline */ smc_conn_abort smc_link_put smc_lgr_cleanup_early smc_link_clear smc_link_put sock_release(lnk->clcsock) smc_listen_out() nginx: accept smc_getname smc->clcsock->ops In case above, smc_getname will panic due to NULL ops. Fixes: fe235e2b("net/smc: Keep first contact clcsock") Signed-off-by: D.
Wythe Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/618 --- net/smc/af_smc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 47f568fc6fb0..821c18da1e87 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2734,14 +2734,20 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct smc_sock *smc; + int r = -ENOTCONN; if (peer && (sock->sk->sk_state != SMC_ACTIVE) && (sock->sk->sk_state != SMC_APPCLOSEWAIT1)) - return -ENOTCONN; + goto out; smc = smc_sk(sock->sk); + down_read(&smc->clcsock_release_lock); + if (smc->clcsock && smc->clcsock->ops) + r = smc->clcsock->ops->getname(smc->clcsock, addr, peer); + up_read(&smc->clcsock_release_lock); - return smc->clcsock->ops->getname(smc->clcsock, addr, peer); +out: + return r; } static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) -- Gitee From 18b7752588b79353078d8a31d7bb4e8bc0c24ae5 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Mon, 14 Feb 2022 16:31:21 +0800 Subject: [PATCH 64/76] anolis: net/smc: Keep first contact clcsock ANBZ: #2003 This introduces a work around for eRDMA. eRDMA reuse the first TCP tuple to create QP, and don't want to release it. But SMC will release this tuple when first contact connection is shutdown. This patch keeps the first contact connection, and delay the shutdown work to link (QP) release progress. Be careful, this patch reverses TCP close process, which means server side closes clcsock when link clears. 
Link: https://gitee.com/anolis/cloud-kernel/pulls/677 Signed-off-by: Tony Lu Reviewed-by: Wen Gu Reviewed-by: Tony Lu --- include/net/netns/smc.h | 1 + net/smc/af_smc.c | 6 ++++-- net/smc/smc.h | 1 + net/smc/smc_close.c | 7 +++++-- net/smc/smc_core.c | 9 +++++++++ net/smc/smc_core.h | 1 + net/smc/smc_sysctl.c | 10 ++++++++++ 7 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 8124f59f856e..647233c0f423 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -32,6 +32,7 @@ struct netns_smc { int sysctl_tcp2smc; int sysctl_allow_different_subnet; int sysctl_disable_multiple_link; + int sysctl_keep_first_contact_clcsock; /* allow simplify rkey exchange when single link */ unsigned int sysctl_simplify_rkey_exhcange; unsigned int sysctl_smc_fastopen; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 821c18da1e87..c8e760219801 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -416,6 +416,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); + smc->keep_clcsock = false; for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { smc->tcp_listen_works[i].smc = smc; INIT_WORK(&smc->tcp_listen_works[i].work, smc_tcp_listen_work); @@ -1837,7 +1838,8 @@ struct sock *smc_accept_dequeue(struct sock *parent, new_sk->sk_prot->unhash(new_sk); down_write(&isk->clcsock_release_lock); if (isk->clcsock) { - sock_release(isk->clcsock); + if (!isk->keep_clcsock) + sock_release(isk->clcsock); isk->clcsock = NULL; } up_write(&isk->clcsock_release_lock); @@ -2944,7 +2946,7 @@ static int smc_shutdown(struct socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (do_shutdown && smc->clcsock) + if (do_shutdown && smc->clcsock && !smc->keep_clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd 
constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; diff --git a/net/smc/smc.h b/net/smc/smc.h index 58eeeb45e237..7e946c9e3099 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -260,6 +260,7 @@ struct smc_sock { /* smc sock container */ /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ + bool keep_clcsock; struct work_struct connect_work; /* handle non-blocking connect*/ struct smc_tcp_listen_work tcp_listen_works[SMC_MAX_TCP_LISTEN_WORKS]; /* handle tcp socket accepts */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 2fe8cc94ba40..74321f6b2230 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -29,10 +29,12 @@ void smc_clcsock_release(struct smc_sock *smc) current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); down_write(&smc->clcsock_release_lock); + /* don't release clcsock for eRDMA */ if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; - sock_release(tcp); + if (!smc->keep_clcsock) + sock_release(tcp); } up_write(&smc->clcsock_release_lock); } @@ -242,7 +244,8 @@ int smc_close_active(struct smc_sock *smc) /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. */ - if (smc->clcsock && smc->clcsock->sk) { + if (smc->clcsock && smc->clcsock->sk && + !smc->keep_clcsock) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? 
rc : rc1; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8c6ad5c5fbec..175bb512fd50 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -917,6 +917,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; smcr_link_iw_extension(&lnk->iw_conn_param, smc->clcsock->sk); + if (smc->keep_clcsock) + lnk->clcsock = smc->clcsock; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { @@ -1262,13 +1264,17 @@ static void __smcr_link_clear(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; + struct socket *clcsock; smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); + clcsock = lnk->clcsock; put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; + if (clcsock) + sock_release(clcsock); if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ @@ -1946,6 +1952,9 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) create: if (ini->first_contact_local) { + /* keep this clcsock for QP reuse */ + if (net->smc.sysctl_keep_first_contact_clcsock) + smc->keep_clcsock = true; rc = smc_lgr_create(smc, ini); if (rc) goto out; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1bcd099a3a97..d92332436a07 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -167,6 +167,7 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ + struct socket *clcsock; /* keep for eRDMA */ }; /* For now we just allow one parallel link per link group. 
The SMC protocol diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index aaba83d0c940..e583bf0be688 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -124,6 +124,15 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "keep_first_contact_clcsock", + .data = &init_net.smc.sysctl_keep_first_contact_clcsock, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -154,6 +163,7 @@ int __net_init smc_sysctl_net_init(struct net *net) WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 1; + net->smc.sysctl_keep_first_contact_clcsock = 1; net->smc.sysctl_disable_multiple_link = 1; /* default on */ net->smc.sysctl_simplify_rkey_exhcange = 1; -- Gitee From 1404299d80a3d792927e6400f13c38f8091827bb Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Mon, 22 Aug 2022 22:14:20 +0800 Subject: [PATCH 65/76] anolis: net/smc: check clcsock->sk validity in fallback restore ANBZ: #2003 This patch checks validity of clcsock->sk when restoring clcsock's callback functions, in case that clcsock has been released by __smcr_link_clear in first contact situation. 
Link: https://gitee.com/anolis/cloud-kernel/pulls/677 Signed-off-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/af_smc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c8e760219801..b8c633a19926 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -260,6 +260,9 @@ static void smc_fback_restore_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; + if (!clcsk) + return; + write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = NULL; -- Gitee From 188ef0e850c543772120fedef07edb62f14fbb48 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 23 Aug 2022 22:57:56 +0800 Subject: [PATCH 66/76] anolis: net/smc: Fix wr tasklet handler for rx/tx wc ANBZ: #2003 This fixes it to handle wc event correctly and remove unnecessary print when receives unexpected wc. Link: https://gitee.com/anolis/cloud-kernel/pulls/677 Signed-off-by: Tony Lu Reviewed-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/smc_wr.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 2d400a0a13f4..ddd8379f2a52 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -467,19 +467,10 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); for (i = 0; i < rc; i++) { - switch (wc[i].opcode) { - case IB_WC_REG_MR: - case IB_WC_SEND: - smc_wr_tx_process_cqe(&wc[i]); - break; - case IB_WC_RECV: + if (wc[i].opcode & IB_WC_RECV) smc_wr_rx_process_cqe(&wc[i]); - break; - default: - pr_warn("smc: unexpected wc opcode %d, status %d, wr_id %llu.\n", - wc[i].opcode, wc[i].status, wc[i].wr_id); - break; - } + else + smc_wr_tx_process_cqe(&wc[i]); } if (rc > 0) -- Gitee From a2888a540ac5c38bae60b8d377bb9dca149bb01e Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 24 Aug 2022 15:49:32 +0800 Subject: [PATCH 67/76] anolis: net/smc: Use SO_REUSEADDR if keep clcsock ANBZ: #2003 This use SO_REUSEADDR to 
avoid address conflict when enabling keep_first_contact. Also it only applies to clcsock instead of smc sk, and it is restored on fallback. Link: https://gitee.com/anolis/cloud-kernel/pulls/677 Signed-off-by: Tony Lu Reviewed-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/af_smc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b8c633a19926..86b1d06e65fd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -475,7 +475,12 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != SMC_INIT || smc->connect_nonblock) goto out_rel; - smc->clcsock->sk->sk_reuse = sk->sk_reuse; + /* use SO_REUSEADDR to keep first contact clcsock */ + if (sock_net(sk)->smc.sysctl_keep_first_contact_clcsock) + smc->clcsock->sk->sk_reuse = SK_CAN_REUSE; + else + smc->clcsock->sk->sk_reuse = sk->sk_reuse; + smc->clcsock->sk->sk_reuseport = sk->sk_reuseport; rc = kernel_bind(smc->clcsock, uaddr, addr_len); @@ -945,6 +950,10 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + /* restore sk_reuse which is SK_CAN_REUSE when + * sysctl_keep_first_contact_clcsock enabled. + */ + smc->clcsock->sk->sk_reuse = smc->sk.sk_reuse; /* There might be some wait entries remaining * in smc sk->sk_wq and they should be woken up -- Gitee From beea1a904b9c32058bd67664730f422766c9e7cd Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Sun, 28 Aug 2022 15:19:53 +0800 Subject: [PATCH 68/76] anolis: net/smc: delay RDMA resource release until connections freed ANBZ: #2003 When SMC-R links down happened, the RDMA resources like mr, QP, PD are destroyed immediately. However, there might be some SMC-R connections still invoking them, which causes a use-after-free panic. So this patch delays the RDMA resources release until SMC-R connections are all freed.
Link: https://gitee.com/anolis/cloud-kernel/pulls/677 Signed-off-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/smc_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 175bb512fd50..4d001500da09 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1266,6 +1266,9 @@ static void __smcr_link_clear(struct smc_link *lnk) struct smc_ib_device *smcibdev; struct socket *clcsock; + smcr_buf_unmap_lgr(lnk); + smc_ib_destroy_queue_pair(lnk); + smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); clcsock = lnk->clcsock; @@ -1289,12 +1292,9 @@ void smcr_link_clear(struct smc_link *lnk, bool log) lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); - smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_error(lnk); smc_wr_free_link(lnk); - smc_ib_destroy_queue_pair(lnk); - smc_ib_dealloc_protection_domain(lnk); smcr_link_put(lnk); /* theoretically last link_put */ } -- Gitee From 73058152b5b9f90208758d038799f02e4b373f6c Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Mon, 29 Aug 2022 15:15:10 +0800 Subject: [PATCH 69/76] anolis: net/smc: Reuse wr_id to recognize wc for tx or rx ANBZ: #2003 Currently, SMC sq and rq is combined to improve performance. It works good except when QP entered into error state. The RDMA device returns wc with status IB_WC_WR_FLUSH_ERR, but opcode of wc is always 0. So we use wr_id by parity to recognize tx or rx. 
Fixes: a0aa581d0179 ("anolis: net/smc: combine send cq and recv cq into one cq") Link: https://gitee.com/anolis/cloud-kernel/pulls/677 Signed-off-by: Tony Lu Reviewed-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/smc_wr.c | 5 +++-- net/smc/smc_wr.h | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ddd8379f2a52..61bba5f69524 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -467,7 +467,8 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); for (i = 0; i < rc; i++) { - if (wc[i].opcode & IB_WC_RECV) + if ((wc[i].opcode & IB_WC_RECV) || + (wc[i].opcode == 0 && smc_wr_id_is_rx(wc[i].wr_id))) smc_wr_rx_process_cqe(&wc[i]); else smc_wr_tx_process_cqe(&wc[i]); @@ -834,7 +835,7 @@ int smc_wr_create_link(struct smc_link *lnk) int rc = 0; smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); - lnk->wr_rx_id = 0; + lnk->wr_rx_id = 1; lnk->wr_rx_dma_addr = ib_dma_map_single( ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, DMA_FROM_DEVICE); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index ce338e1ca6c2..b18986fda110 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -56,7 +56,7 @@ struct smc_wr_rx_handler { */ static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link) { - return atomic_long_inc_return(&link->wr_tx_id); + return atomic_long_add_return(2, &link->wr_tx_id); } static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) @@ -151,7 +151,8 @@ static inline int smc_wr_rx_post(struct smc_link *link) u64 wr_id, temp_wr_id; u32 index; - wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */ + link->wr_rx_id += 2; + wr_id = link->wr_rx_id; /* tasklet context, thus not atomic */ temp_wr_id = wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; @@ -161,6 +162,11 @@ static inline int smc_wr_rx_post(struct smc_link *link) return rc; } 
+static inline bool smc_wr_id_is_rx(u64 wr_id) +{ + return wr_id % 2; +} + int smc_wr_create_link(struct smc_link *lnk); int smc_wr_alloc_link_mem(struct smc_link *lnk); int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr); -- Gitee From 0d8604ba7fa01188fa54ccec60202e51c9f1639d Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 29 Aug 2022 17:48:29 +0800 Subject: [PATCH 70/76] anolis: net/smc: fix deadlock when lgr terminating ANBZ: #2003 A potential deadlock may occur in the following scenarios: smc_cdc_tx_handler __smc_lgr_terminate smc_tx_pending atomic_inc(&conn->cdc_pend_tx_wr); rc = smc_wr_tx_send lock_sock smc_conn_kill smc_cdc_wait_pend_tx_wr cdc_pend_tx_wr = 1 if (rc) atomic_dec(&conn->cdc_pend_tx_wr); /* dead wait, no change to wake up */ wait_event(cdc_pend_tx_wq) In order to prevent this scenario, we need to check at all subtraction operations, and try to wake up on every time it downs to zero. Fixes: a4de4839596b ("net/smc: fix kernel panic caused by race of smc_sock") Link: https://gitee.com/anolis/cloud-kernel/pulls/677 Signed-off-by: D. 
Wythe Reviewed-by: Tony Lu --- net/smc/smc_cdc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 25b836df9f50..a0ce6908876b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -136,7 +136,8 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_wr_rx_put_credits(link, saved_credits); - atomic_dec(&conn->cdc_pend_tx_wr); + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) + wake_up(&conn->cdc_pend_tx_wq); } return rc; @@ -168,8 +169,10 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn, smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (unlikely(rc)) - atomic_dec(&conn->cdc_pend_tx_wr); + if (unlikely(rc)) { + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) + wake_up(&conn->cdc_pend_tx_wq); + } return rc; } -- Gitee From cfa400a406167e97c654f191aca99303e4d1286c Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 29 Aug 2022 19:17:07 +0800 Subject: [PATCH 71/76] anolis: net/smc: fix panic smc_tcp_syn_recv_sock() while closing listen socket ANBZ: #2003 Consider the following scenarios: smc_release smc_close_active write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = NULL; write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_tcp_syn_recv_sock smc = smc_clcsock_user_data(sk); write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); Hence, we may read a NULL value in smc_tcp_syn_recv_sock(). Note that the function has already been protected by sk_callback_lock, therefore, we only need to judge whether it is empty. Fixes: cc5bfad941b1 ("net/smc: Limit SMC visits when handshake workqueue congested") Link: https://gitee.com/anolis/cloud-kernel/pulls/677 Signed-off-by: D.
Wythe Reviewed-by: Tony Lu --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 86b1d06e65fd..3a483219fff9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -135,6 +135,8 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, struct sock *child; smc = smc_clcsock_user_data(sk); + if (unlikely(!smc)) + goto drop; if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > sk->sk_max_ack_backlog) -- Gitee From 8438d09581acda333d8046dfc97a46202ff6d437 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 31 Aug 2022 11:07:54 +0800 Subject: [PATCH 72/76] anolis: net/smc: Fix task hung in SMC-R terminating all link groups ANBZ: #2045 When performing a stress test on SMC-R by rmmod mlx5_ib driver during the wrk/nginx test, we found that there is a probability of triggering a task hung in terminating all link groups. This issue is caused by the race between smc_smcr_terminate_all() and smc_lgr_create(). smc_smcr_terminate_all() | smc_lgr_create --------------------------------------------------------------- for links in smcibdev | schedule links down | | smcr_link_init | \- smcibdev->lnk_cnt++ | wait for smcibdev->lnk_cnt == 0 | (hung until new lgr free) | This patch tries to fix the issue by preventing new link group creation during IB driver remove. 
Fixes: 0b29ec643613 ("net/smc: immediate termination for SMCR link groups") Link: https://gitee.com/anolis/cloud-kernel/pulls/688 Signed-off-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/smc_core.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4d001500da09..7ea86002c855 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -831,11 +831,13 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { + struct smc_ib_device *ibdev; struct smc_link_group *lgr; struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; + int ibport; int rc = 0; int i; @@ -889,9 +891,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ - struct smc_ib_device *ibdev; - int ibport; - lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; lgr->smc_version = ini->smcr_version; memcpy(lgr->peer_systemid, ini->peer_systemid, @@ -907,6 +906,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) ibdev = ini->ib_dev; ibport = ini->ib_port; } + mutex_lock(&smc_ib_devices.mutex); + if (list_empty(&ibdev->list) || + test_bit(ibport, ibdev->ports_going_away)) { + /* ibdev unavailable */ + rc = SMC_CLC_DECL_NOSMCRDEV; + goto free_wq; + } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); rc = smc_wr_alloc_lgr_mem(lgr); @@ -935,9 +941,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_lock_bh(lgr_lock); list_add_tail(&lgr->list, lgr_list); spin_unlock_bh(lgr_lock); + if (!ini->is_smcd) + mutex_unlock(&smc_ib_devices.mutex); return 0; free_wq: + if (!ini->is_smcd) + mutex_unlock(&smc_ib_devices.mutex); destroy_workqueue(lgr->tx_wq); free_lgr: kfree(lgr); @@ -946,10 +956,16 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id); out: if (rc < 0) { - if (rc == -ENOMEM) + switch (rc) { + case -ENOMEM: rc = SMC_CLC_DECL_MEM; - else + break; + case SMC_CLC_DECL_NOSMCRDEV: + break; + default: rc = SMC_CLC_DECL_INTERR; + break; + } } return rc; } -- Gitee From 863e6c09a008a313449f50064f78131af4c2c159 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Fri, 2 Sep 2022 00:03:02 +0800 Subject: [PATCH 73/76] anolis: net/smc: Restore rx queue depth ANBZ: #2045 This restore the rx queue depth when divides wr_id by parity. In order to help rq credit recover as quickly as possible to avoid potential timeout when tx get free slot, especially in short-live connection cases. 
Fixes: dea7289b8bb2 ("anolis: net/smc: Reuse wr_id to recognize wc for tx or rx") Link: https://gitee.com/anolis/cloud-kernel/pulls/688 Signed-off-by: Tony Lu Reviewed-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/smc_wr.c | 5 ++--- net/smc/smc_wr.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 61bba5f69524..a4b9ba6532f9 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -406,7 +406,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) if (wc->byte_len < sizeof(*wr_rx)) return; /* short message */ - temp_wr_id = wc->wr_id; + temp_wr_id = wc->wr_id / 2; index = do_div(temp_wr_id, link->wr_rx_cnt); wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { @@ -467,8 +467,7 @@ static void smc_wr_tasklet_fn(struct tasklet_struct *t) memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); for (i = 0; i < rc; i++) { - if ((wc[i].opcode & IB_WC_RECV) || - (wc[i].opcode == 0 && smc_wr_id_is_rx(wc[i].wr_id))) + if (smc_wr_id_is_rx(wc[i].wr_id)) smc_wr_rx_process_cqe(&wc[i]); else smc_wr_tx_process_cqe(&wc[i]); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index b18986fda110..7f3909b5ac64 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -153,7 +153,7 @@ static inline int smc_wr_rx_post(struct smc_link *link) link->wr_rx_id += 2; wr_id = link->wr_rx_id; /* tasklet context, thus not atomic */ - temp_wr_id = wr_id; + temp_wr_id = wr_id / 2; index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); -- Gitee From bf3068813cbe5f37f9ea20ac3525f0f4605dbdb8 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 2 Sep 2022 14:35:02 +0800 Subject: [PATCH 74/76] anolis: net/smc: Avoid use-after-free of smcibcq tasklet ANBZ: #2045 smc_ib_cleanup_cq() frees smcibcq and then smc_wr_remove_dev() accesses smcibcq->tasklet, 
thus causing a use-after-free issue. So this patch tries to fix it by reversing the order. Fixes: a0aa581d0179 ("anolis: net/smc: combine send cq and recv cq into one cq") Link: https://gitee.com/anolis/cloud-kernel/pulls/688 Reported-by: Yacan Liu Signed-off-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/smc_ib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 8d0454737c69..9492365c3d05 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -876,6 +876,7 @@ static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) ib_destroy_cq(smcibdev->smcibcq[i].ib_cq); } } + smc_wr_remove_dev(smcibdev); kfree(smcibdev->smcibcq); } @@ -943,7 +944,6 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) goto out; smcibdev->initialized = 0; smc_ib_cleanup_cq(smcibdev); - smc_wr_remove_dev(smcibdev); out: mutex_unlock(&smcibdev->mutex); } -- Gitee From 0d9570471fb009ed649fe2ed2f7313c46f0782fe Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Fri, 2 Sep 2022 21:21:12 +0800 Subject: [PATCH 75/76] anolis: net/smc: Fix UAF when fallback after lgr create ANBZ: #2045 This fixes a use-after-free issue: when lgr creation fails or RDMA setup fails, the connection switches to fallback and should not release the clcsock.
Fixes: d6b4db06f3b8 ("anolis: net/smc: Keep first contact clcsock") Link: https://gitee.com/anolis/cloud-kernel/pulls/688 Signed-off-by: Tony Lu Reviewed-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/af_smc.c | 4 +++- net/smc/smc_core.c | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3a483219fff9..dcbaf94bb495 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1023,8 +1023,10 @@ static void smc_conn_abort(struct smc_sock *smc, int local_first) lgr_valid = true; smc_conn_free(conn); - if (local_first && lgr_valid) + if (local_first && lgr_valid) { + smc->keep_clcsock = false; smc_lgr_cleanup_early(lgr); + } } /* check if there is a rdma device available for this connection. */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7ea86002c855..2086fabf016b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -627,11 +627,20 @@ int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) void smc_lgr_cleanup_early(struct smc_link_group *lgr) { + struct smc_link *link; spinlock_t *lgr_lock; + u8 link_idx; if (!lgr) return; + /* ONLY one link expected */ + link_idx = SMC_SINGLE_LINK; + link = &lgr->lnk[link_idx]; + if (link) + /* current is fallback, do not release clcsock */ + link->clcsock = NULL; + smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ @@ -1979,6 +1988,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); if (rc) { + smc->keep_clcsock = false; smc_lgr_cleanup_early(lgr); goto out; } -- Gitee From 16afef456090e9278aaf3bfcf3cf8980203e1c9e Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Fri, 2 Sep 2022 20:08:36 +0800 Subject: [PATCH 76/76] anolis: net/smc: Fix NULL sndbuf_desc in smc_cdc_tx_handler() ANBZ: #2045 When performing a stress test on SMC-R by rmmod'ing the mlx5_ib driver during the wrk/nginx test, we found that there is a probability of triggering a panic while terminating all link groups. This issue is due to the race between smc_smcr_terminate_all() and smc_buf_create(). smc_smcr_terminate_all smc_buf_create __smc_lgr_terminate smc_conn_kill smc_close_abort smc_cdc_get_slot_and_msg_send __softirqentry_text_start smc_wr_tx_process_cqe smc_cdc_tx_handler READ(conn->sndbuf_desc->len); /* panic due to NULL sndbuf_desc */ conn->sndbuf_desc = xxx; This patch tries to fix the issue by always checking the sndbuf_desc before sending any cdc msg. Fixes: 0b29ec643613 ("net/smc: immediate termination for SMCR link groups") Link: https://gitee.com/anolis/cloud-kernel/pulls/688 Signed-off-by: D. Wythe Reviewed-by: Wen Gu Reviewed-by: Tony Lu --- net/smc/smc_cdc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index a0ce6908876b..fc29948e245b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -116,6 +116,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, u8 saved_credits = 0; int rc; + if (unlikely(!READ_ONCE(conn->sndbuf_desc))) + return -EINVAL; + smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; -- Gitee