diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index c53f8c61c9e488c5945520f4dadce3caf3a994c7..a93857e580b0d56222d46cb70a412714fc41f3ff 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -21,3 +21,16 @@ autocorking_size - INTEGER know how/when to uncork their sockets. Default: 64K + +smcr_buf_type - INTEGER + Controls which type of sndbufs and RMBs to use in later newly created + SMC-R link group. Only for SMC-R. + + Default: 0 (physically contiguous sndbufs and RMBs) + + Possible values: + + - 0 - Use physically contiguous buffers + - 1 - Use virtually contiguous buffers + - 2 - Mixed use of the two types. Try physically contiguous buffers first. + If not available, use virtually contiguous buffers then. diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 135cfa9f42c449f6a094d71ed59f739454ce620f..5a888a25f527c395c496b12a7507dba2933616fd 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -25,11 +25,17 @@ struct netns_smc { struct ctl_table_header *smc_hdr; #endif unsigned int sysctl_autocorking_size; + unsigned int sysctl_smcr_buf_type; int sysctl_wmem_default; int sysctl_rmem_default; int sysctl_tcp2smc; int sysctl_allow_different_subnet; int sysctl_keep_first_contact_clcsock; int sysctl_disable_multiple_link; + /* allow simplify rkey exchange when single link */ + unsigned int sysctl_simplify_rkey_exhcange; + unsigned int sysctl_smc_fastopen; + /* use diff TCP experiment magic code */ + unsigned int sysctl_smc_experiments; }; #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 94532793107f8d8da7df9ce73d703c8f62c3cc18..f791206af8918b54c46c01525c188ba2637f37ff 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -195,6 +195,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 +/* "SMCO" in EBCDIC encoding */ +#define 
TCPOPT_SMC_OK_MAGIC 0xE2D4C3D6 /* * TCP option lengths diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 759bcb2ff03effa84d434dd6bdafa0a3564fa2a2..4ec01eb8215e0575ad82732549900c69f26e2e21 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -125,6 +125,9 @@ enum { SMC_NLA_LGR_R_CONNS_NUM, /* u32 */ SMC_NLA_LGR_R_V2_COMMON, /* nest */ SMC_NLA_LGR_R_V2, /* nest */ + SMC_NLA_LGR_R_NET_COOKIE, /* u64 */ + SMC_NLA_LGR_R_PAD, /* flag */ + SMC_NLA_LGR_R_BUF_TYPE, /* u8 */ __SMC_NLA_LGR_R_MAX, SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1 }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b8227e6a78bdebb35ff13f25ba5b6b8d7e4ff3d6..f524a5f8f5521d42fe106f980b59a0b4f9d876dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3903,15 +3903,26 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, + const struct net *net, int opsize) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && - opsize >= TCPOLEN_EXP_SMC_BASE && - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { - opt_rx->smc_ok = 1; - return true; + opsize >= TCPOLEN_EXP_SMC_BASE) { + /* syn ack */ + if (th->ack && net->smc.sysctl_smc_experiments) { + if (get_unaligned_be32(ptr) == TCPOPT_SMC_OK_MAGIC) { + opt_rx->smc_ok = 1; + return true; + } + return false; + } + /* syn only */ + if (get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { + opt_rx->smc_ok = 1; + return true; + } } } #endif @@ -4074,7 +4085,7 @@ void tcp_parse_options(const struct net *net, break; } - if (smc_parse_options(th, opt_rx, ptr, opsize)) + if (smc_parse_options(th, opt_rx, ptr, net, opsize)) break; opt_rx->saw_unknown = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 08f059a285f7c57c08aa07ea56b5e3101bba70b6..13a562ae567f94ee9d8df613c045d6397243e2b6 100644 --- 
a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -416,6 +416,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) #define OPTION_MPTCP (1 << 10) +#define OPTION_SMC_OK BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -427,6 +428,12 @@ static void smc_options_write(__be32 *ptr, u16 *options) (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } else if (OPTION_SMC_OK & *options) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_OK_MAGIC); } } #endif @@ -726,10 +733,15 @@ static void smc_set_option_cond(const struct tcp_sock *tp, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) + const struct sock *sk; + + sk = &tp->inet_conn.icsk_inet.sk; + if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { - opts->options |= OPTION_SMC; + opts->options |= sock_net(sk)->smc.sysctl_smc_experiments ? + OPTION_SMC_OK : OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b59fe3958a2748820e3c67186c5a6b5c039e7979..9838ad187b2c1145413c0830f82451b73ed6c12c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -70,6 +70,15 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); +static inline int smc_clcsock_enable_fastopen(struct smc_sock *smc, int is_server) +{ + int val = 1; + + return smc->clcsock->ops->setsockopt(smc->clcsock, SOL_TCP, + is_server ? 
TCP_FASTOPEN : TCP_FASTOPEN_CONNECT, + KERNEL_SOCKPTR(&val), sizeof(val)); +} + int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); @@ -305,8 +314,12 @@ static int __smc_release(struct smc_sock *smc) smc_clcsock_release(smc); lock_sock(sk); } - if (!smc->use_fallback) - smc_conn_free(&smc->conn); + + if (!smc->use_fallback) { + sock_hold(sk); + if (!queue_work(smc_close_wq, &smc->free_work)) + sock_put(sk); + } } return rc; @@ -330,7 +343,7 @@ static int smc_release(struct socket *sock) if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); - if (cancel_work_sync(&smc->connect_work)) + if (smc->connect_nonblock && cancel_work_sync(&smc->connect_work)) sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) @@ -368,12 +381,29 @@ static void smc_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } +static void smc_free_work(struct work_struct *work) +{ + struct sock *sk; + struct smc_sock *smc = container_of(work, struct smc_sock, + free_work); + + sk = &smc->sk; + + lock_sock(sk); + if (sk->sk_state == SMC_CLOSED && !smc->use_fallback) + smc_conn_free(&smc->conn); + release_sock(sk); + + sock_put(sk); /* before queue */ +} + static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, int protocol) { struct smc_sock *smc; struct proto *prot; struct sock *sk; + int i = 0; prot = (protocol == SMCPROTO_SMC6) ? 
&smc_proto6 : &smc_proto; sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); @@ -388,7 +418,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); smc->keep_clcsock = 0; - INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) { + smc->tcp_listen_works[i].smc = smc; + INIT_WORK(&smc->tcp_listen_works[i].work, smc_tcp_listen_work); + } + atomic_set(&smc->tcp_listen_work_seq, 0); + INIT_WORK(&smc->free_work, smc_free_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); @@ -396,9 +431,13 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); - mutex_init(&smc->clcsock_release_lock); + init_rwsem(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); + /* default behavior from every net namespace */ + smc->simplify_rkey_exhcange = net->smc.sysctl_simplify_rkey_exhcange; + smc->smc_fastopen = net->smc.sysctl_smc_fastopen; + return sk; } @@ -495,38 +534,71 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } +/* register the new vzalloced sndbuf on all links */ +static int smcr_lgr_reg_sndbufs(struct smc_link *link, + struct smc_buf_desc *snd_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + if (!snd_desc->is_vm) + return -EINVAL; + + /* protect against parallel smcr_link_reg_buf() */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); + if (rc) + break; + } + mutex_unlock(&lgr->llc_conf_mutex); + return rc; +} + /* register the new rmb on all links */ -static int smcr_lgr_reg_rmbs(struct smc_link *link, 
+static int smcr_lgr_reg_rmbs(struct smc_sock *smc, struct smc_buf_desc *rmb_desc) { + struct smc_link *link = smc->conn.lnk; struct smc_link_group *lgr = link->lgr; - int i, rc = 0; + int i, lnk = 0, rc = 0; - rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); - if (rc) - return rc; + if (!smc->simplify_rkey_exhcange) { + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + } /* protect against parallel smc_llc_cli_rkey_exchange() and - * parallel smcr_link_reg_rmb() + * parallel smcr_link_reg_buf() */ mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; - rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); + rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; + /* available link count inc */ + lnk++; } - /* exchange confirm_rkey msg with peer */ - rc = smc_llc_do_confirm_rkey(link, rmb_desc); - if (rc) { - rc = -EFAULT; - goto out; + /* do not exchange confirm_rkey msg since there are only one link */ + if (lnk > 1 || !smc->simplify_rkey_exhcange) { + /* exchange confirm_rkey msg with peer */ + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; + } } + rmb_desc->is_conf_rkey = true; out: mutex_unlock(&lgr->llc_conf_mutex); - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + if (!smc->simplify_rkey_exhcange) + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } @@ -559,8 +631,15 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; @@ -848,7 
+927,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { int rc = 0; - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { rc = -EBADF; goto out; @@ -871,7 +950,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc_fback_replace_callbacks(smc); } out: - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return rc; } @@ -1099,9 +1178,13 @@ static int smc_connect_clc(struct smc_sock *smc, rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; + + release_sock(&smc->sk); /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, - SMC_CLC_ACCEPT, CLC_WAIT_TIME); + rc = smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + SMC_CLC_ACCEPT, CLC_WAIT_TIME); + lock_sock(&smc->sk); + return rc; } void smc_fill_gid_list(struct smc_link_group *lgr, @@ -1182,10 +1265,8 @@ static int smc_connect_rdma(struct smc_sock *smc, if (reason_code) return reason_code; - mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); if (reason_code) { - mutex_unlock(&smc_client_lgr_pending); return reason_code; } @@ -1244,12 +1325,18 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } - if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_ERR_REGRMB; + /* reg sendbufs if they were vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } + } + if (smcr_lgr_reg_rmbs(smc, smc->conn.rmb_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } - smc_rmb_sync_sg_for_device(&smc->conn); if (aclc->hdr.version > SMC_V1) { struct smc_clc_msg_accept_confirm_v2 *clc_v2 = @@ -1276,7 +1363,6 @@ static int smc_connect_rdma(struct smc_sock *smc, if (reason_code) goto connect_abort; } - mutex_unlock(&smc_client_lgr_pending); 
smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; @@ -1286,7 +1372,6 @@ static int smc_connect_rdma(struct smc_sock *smc, return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); - mutex_unlock(&smc_client_lgr_pending); smc->connect_nonblock = 0; return reason_code; @@ -1510,6 +1595,11 @@ static void smc_connect_work(struct work_struct *work) if (!timeo) timeo = MAX_SCHEDULE_TIMEOUT; + + if (smc->smc_fastopen && + inet_sk(smc->clcsock->sk)->defer_connect) + goto defer_connect; + lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; @@ -1522,6 +1612,7 @@ static void smc_connect_work(struct work_struct *work) rc = 0; } release_sock(smc->clcsock->sk); +defer_connect: lock_sock(&smc->sk); if (rc != 0 || smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; @@ -1567,9 +1658,29 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out_err; lock_sock(sk); + switch (sock->state) { + default: + rc = -EINVAL; + goto out; + case SS_CONNECTED: + rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; + goto out; + case SS_CONNECTING: + if (sk->sk_state == SMC_ACTIVE) + goto connected; + break; + case SS_UNCONNECTED: + sock->state = SS_CONNECTING; + break; + } + switch (sk->sk_state) { default: goto out; + case SMC_CLOSED: + rc = sock_error(sk) ? 
: -ECONNABORTED; + sock->state = SS_UNCONNECTED; + goto out; case SMC_ACTIVE: rc = -EISCONN; goto out; @@ -1577,31 +1688,44 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, break; } + if (!smc->clcsock || + (smc->clcsock && !smc->clcsock->sk)) { + rc = -EBADF; + goto out; + } smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; if (smc->connect_nonblock) { rc = -EALREADY; goto out; } + + if (smc->smc_fastopen && smc_clcsock_enable_fastopen(smc, /* is_server */ 0)) + smc->smc_fastopen = 0; /* rollback when setsockopt failed */ + rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; - sock_hold(&smc->sk); /* sock put in passive closing */ - if (smc->use_fallback) + if (smc->use_fallback) { + sock->state = rc ? SS_CONNECTING : SS_CONNECTED; goto out; + } + sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; + goto out; } else { rc = __smc_connect(smc); if (rc < 0) goto out; - else - rc = 0; /* success cases including fallback */ } +connected: + rc = 0; + sock->state = SS_CONNECTED; out: release_sock(sk); out_err: @@ -1615,35 +1739,27 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc = -EINVAL; - release_sock(lsk); + down_read(&lsmc->clcsock_release_lock); + if (lsmc->clcsock) { + if (lsmc->clcsock->sk->sk_ack_backlog) + rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); + else + rc = -EAGAIN; + } + up_read(&lsmc->clcsock_release_lock); + if (rc < 0 && rc != -EAGAIN) + lsk->sk_err = -rc; + if (rc < 0 || lsk->sk_state == SMC_CLOSED) + goto err_out; + new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; - *new_smc = NULL; - lock_sock(lsk); - goto out; + goto err_out; } *new_smc = smc_sk(new_sk); - 
mutex_lock(&lsmc->clcsock_release_lock); - if (lsmc->clcsock) - rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); - mutex_unlock(&lsmc->clcsock_release_lock); - lock_sock(lsk); - if (rc < 0 && rc != -EAGAIN) - lsk->sk_err = -rc; - if (rc < 0 || lsk->sk_state == SMC_CLOSED) { - new_sk->sk_prot->unhash(new_sk); - if (new_clcsock) - sock_release(new_clcsock); - new_sk->sk_state = SMC_CLOSED; - sock_set_flag(new_sk, SOCK_DEAD); - sock_put(new_sk); /* final */ - *new_smc = NULL; - goto out; - } - /* new clcsock has inherited the smc listen-specific sk_data_ready * function; switch it back to the original sk_data_ready function */ @@ -1662,7 +1778,12 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) } (*new_smc)->clcsock = new_clcsock; -out: + + return 0; +err_out: + *new_smc = NULL; + if (new_clcsock) + sock_release(new_clcsock); return rc; } @@ -1676,8 +1797,8 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); - spin_unlock(&par->accept_q_lock); sk_acceptq_added(parent); + spin_unlock(&par->accept_q_lock); } /* remove a socket from the accept queue of its parental listening socket */ @@ -1687,11 +1808,16 @@ static void smc_accept_unlink(struct sock *sk) spin_lock(&par->accept_q_lock); list_del_init(&smc_sk(sk)->accept_q); - spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); + spin_unlock(&par->accept_q_lock); sock_put(sk); /* sock_hold in smc_accept_enqueue */ } +static inline bool smc_accept_queue_empty(struct sock *sk) +{ + return list_empty(&smc_sk(sk)->accept_q); +} + /* remove a sock from the accept queue to bind it to a new socket created * for a socket accept call from user space */ @@ -1707,15 +1833,18 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { 
new_sk->sk_prot->unhash(new_sk); + down_write(&isk->clcsock_release_lock); if (isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } + up_write(&isk->clcsock_release_lock); sock_put(new_sk); /* final */ continue; } if (new_sock) { sock_graft(new_sk, new_sock); + new_sock->state = SS_CONNECTED; if (isk->use_fallback) { smc_sk(new_sk)->clcsock->file = new_sock->file; isk->clcsock->file->private_data = isk->clcsock; @@ -1749,8 +1878,15 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced*/ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); @@ -1792,7 +1928,7 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + if (new_smc->smc_negotiated) atomic_dec(&lsmc->queued_smc_hs); if (lsmc->sk.sk_state == SMC_LISTEN) { @@ -2111,10 +2247,15 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) struct smc_connection *conn = &new_smc->conn; if (!local_first) { - if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg sendbufs if they were vzalloced */ + if (conn->sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(conn->lnk, + conn->sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + if (smcr_lgr_reg_rmbs(new_smc, conn->rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; } - smc_rmb_sync_sg_for_device(&new_smc->conn); return 0; } @@ -2162,6 +2303,7 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, 
not_found: ini->smcr_version &= ~SMC_V2; + ini->smcrv2.ib_dev_v2 = NULL; ini->check_smcrv2 = false; } @@ -2288,16 +2430,6 @@ static void smc_listen_work(struct work_struct *work) return; } - /* check if peer is smc capable */ - if (!tcp_sk(newclcsock->sk)->syn_smc) { - rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); - if (rc) - smc_listen_out_err(new_smc); - else - smc_listen_out_connected(new_smc); - return; - } - /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ @@ -2332,7 +2464,8 @@ static void smc_listen_work(struct work_struct *work) if (rc) goto out_decl; - mutex_lock(&smc_server_lgr_pending); + if (ini->is_smcd) + mutex_lock(&smc_server_lgr_pending); smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); @@ -2370,16 +2503,18 @@ static void smc_listen_work(struct work_struct *work) ini->first_contact_local, ini); if (rc) goto out_unlock; - mutex_unlock(&smc_server_lgr_pending); } + smc_conn_leave_rtoken_pending(new_smc, ini); smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; out_unlock: - mutex_unlock(&smc_server_lgr_pending); + if (ini->is_smcd) + mutex_unlock(&smc_server_lgr_pending); out_decl: + smc_conn_leave_rtoken_pending(new_smc, ini); smc_listen_decline(new_smc, rc, ini ? 
ini->first_contact_local : 0, proposal_version); out_free: @@ -2389,13 +2524,13 @@ static void smc_listen_work(struct work_struct *work) static void smc_tcp_listen_work(struct work_struct *work) { - struct smc_sock *lsmc = container_of(work, struct smc_sock, - tcp_listen_work); + struct smc_tcp_listen_work *twork = + container_of(work, struct smc_tcp_listen_work, work); + struct smc_sock *lsmc = twork->smc; struct sock *lsk = &lsmc->sk; struct smc_sock *new_smc; int rc = 0; - lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); if (rc) /* clcsock accept queue empty or error */ @@ -2403,9 +2538,6 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) - atomic_inc(&lsmc->queued_smc_hs); - new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -2414,13 +2546,27 @@ static void smc_tcp_listen_work(struct work_struct *work) smc_copy_sock_settings_to_smc(new_smc); new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; - sock_hold(&new_smc->sk); /* sock_put in passive closing */ - if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) - sock_put(&new_smc->sk); + + /* check if peer is smc capable */ + if (!tcp_sk(new_smc->clcsock->sk)->syn_smc) { + sock_hold(&new_smc->sk); /* sock_put in passive closing */ + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); + } else { + new_smc->smc_negotiated = 1; + atomic_inc(&lsmc->queued_smc_hs); + /* memory barrier */ + smp_mb__after_atomic(); + sock_hold(&new_smc->sk); /* sock_put in passive closing */ + if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) + sock_put(&new_smc->sk); + } } out: - release_sock(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } @@ -2434,8 +2580,10 @@ static void 
smc_clcsock_data_ready(struct sock *listen_clcsock) goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { + int idx = atomic_fetch_inc(&lsmc->tcp_listen_work_seq) % + SMC_MAX_TCP_LISTEN_WORKS; sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_works[idx].work)) sock_put(&lsmc->sk); } out: @@ -2453,7 +2601,7 @@ static int smc_listen(struct socket *sock, int backlog) rc = -EINVAL; if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || - smc->connect_nonblock) + smc->connect_nonblock || sock->state != SS_UNCONNECTED) goto out; rc = 0; @@ -2489,6 +2637,9 @@ static int smc_listen(struct socket *sock, int backlog) if (smc->limit_smc_hs) tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + if (smc->smc_fastopen && smc_clcsock_enable_fastopen(smc, /* is server */ 1)) + smc->smc_fastopen = 0; /* rollback when setsockopt failed */ + rc = kernel_listen(smc->clcsock, backlog); if (rc) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); @@ -2511,9 +2662,10 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) { struct sock *sk = sock->sk, *nsk; - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT(wait); struct smc_sock *lsmc; long timeo; + bool waited = false; int rc = 0; lsmc = smc_sk(sk); @@ -2526,17 +2678,19 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, goto out; } - /* Wait for an incoming connection */ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { - set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { rc = -EAGAIN; break; } + /* Wait for an incoming connection */ + prepare_to_wait_exclusive(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + waited = true; release_sock(sk); - timeo = schedule_timeout(timeo); + if (smc_accept_queue_empty(sk)) + 
timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ sched_annotate_sleep(); lock_sock(sk); @@ -2545,8 +2699,9 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, break; } } - set_current_state(TASK_RUNNING); - remove_wait_queue(sk_sleep(sk), &wait); + + if (waited) + finish_wait(sk_sleep(sk), &wait); if (!rc) rc = sock_error(nsk); @@ -2663,17 +2818,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, return rc; } -static __poll_t smc_accept_poll(struct sock *parent) +static inline __poll_t smc_accept_poll(struct sock *parent) { - struct smc_sock *isk = smc_sk(parent); - __poll_t mask = 0; + if (!smc_accept_queue_empty(parent)) + return EPOLLIN | EPOLLRDNORM; - spin_lock(&isk->accept_q_lock); - if (!list_empty(&isk->accept_q)) - mask = EPOLLIN | EPOLLRDNORM; - spin_unlock(&isk->accept_q_lock); - - return mask; + return 0; } static __poll_t smc_poll(struct file *file, struct socket *sock, @@ -2745,6 +2895,17 @@ static int smc_shutdown(struct socket *sock, int how) lock_sock(sk); + if (sock->state == SS_CONNECTING) { + if (sk->sk_state == SMC_ACTIVE) + sock->state = SS_CONNECTED; + else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || + sk->sk_state == SMC_PEERCLOSEWAIT2 || + sk->sk_state == SMC_APPCLOSEWAIT1 || + sk->sk_state == SMC_APPCLOSEWAIT2 || + sk->sk_state == SMC_APPFINCLOSEWAIT) + sock->state = SS_DISCONNECTING; + } + rc = -ENOTCONN; if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && @@ -2758,6 +2919,7 @@ static int smc_shutdown(struct socket *sock, int how) sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sk->sk_socket->state = SS_UNCONNECTED; sock_put(sk); } goto out; @@ -2783,6 +2945,10 @@ static int smc_shutdown(struct socket *sock, int how) /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; + if (sk->sk_state == SMC_CLOSED) + sock->state = 
SS_UNCONNECTED; + else + sock->state = SS_DISCONNECTING; out: release_sock(sk); return rc ? rc : rc1; @@ -2870,9 +3036,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EBADF; } if (unlikely(!smc->clcsock->ops->setsockopt)) @@ -2884,7 +3050,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; @@ -2950,19 +3116,19 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, return __smc_getsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sock->sk); - mutex_lock(&smc->clcsock_release_lock); + down_read(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EBADF; } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) { - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return -EOPNOTSUPP; } rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); - mutex_unlock(&smc->clcsock_release_lock); + up_read(&smc->clcsock_release_lock); return rc; } @@ -3168,6 +3334,7 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, rc = -ENOBUFS; sock->ops = &smc_sock_ops; + sock->state = SS_UNCONNECTED; sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; @@ -3331,7 +3498,7 @@ static int __init smc_init(void) rc = -ENOMEM; - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_UNBOUND | 
WQ_HIGHPRI, 0); if (!smc_tcp_ls_wq) goto out_pnet; diff --git a/net/smc/smc.h b/net/smc/smc.h index 05864aeb790994bfae73215e3ba711798e1d633e..7e946c9e3099e8e48a320c2c30c8706720eac58c 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -240,6 +240,13 @@ struct smc_connection { u8 out_of_sync : 1; /* out of sync with peer */ }; +#define SMC_MAX_TCP_LISTEN_WORKS 2 + +struct smc_tcp_listen_work { + struct smc_sock *smc; + struct work_struct work; +}; + struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ @@ -255,11 +262,17 @@ struct smc_sock { /* smc sock container */ struct smc_sock *listen_smc; /* listen parent */ bool keep_clcsock; struct work_struct connect_work; /* handle non-blocking connect*/ - struct work_struct tcp_listen_work;/* handle tcp socket accepts */ + struct smc_tcp_listen_work tcp_listen_works[SMC_MAX_TCP_LISTEN_WORKS]; + /* handle tcp socket accepts */ + atomic_t tcp_listen_work_seq;/* used to select tcp_listen_works */ struct work_struct smc_listen_work;/* prepare new accept socket */ + struct work_struct free_work; /* free smc conn */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool limit_smc_hs; /* put constraint on handshake */ + bool simplify_rkey_exhcange; /* simplify rkey exchange */ + /* enable SMC-R handshake proposal via tcp fastopen */ + bool smc_fastopen; bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ @@ -276,11 +289,16 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ + u8 smc_negotiated : 1; + /* whether the smc_sock + * was successfully negotiated + * via TCP options. 
+ */ u8 connect_nonblock : 1; /* non-blocking connect in * flight */ - struct mutex clcsock_release_lock; + struct rw_semaphore clcsock_release_lock; /* protects clcsock of a listen * socket * */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index c469a0c67c3c1d09229e9452e397becfe28734b4..4a5b4f1f24b399683f5dd6e698ad0887b1f1d8b5 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -34,7 +34,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { - atomic_inc(&link->cdc_comp_cnt); diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); @@ -83,7 +82,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, /* abnormal termination */ if (!rc) smc_wr_tx_put_slot(link, - (struct smc_wr_tx_pend_priv *)pend); + (struct smc_wr_tx_pend_priv *)(*pend)); rc = -EPIPE; } return rc; @@ -122,7 +121,8 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc(cdc_msg, conn, &cfed); - saved_credits = (u8)smc_wr_rx_get_credits(link); + if (smc_wr_rx_credits_need_announce_frequent(link)) + saved_credits = (u8)smc_wr_rx_get_credits(link); cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); @@ -132,12 +132,12 @@ int smc_cdc_msg_send(struct smc_connection *conn, if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; - atomic_inc(&link->cdc_send_cnt); } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_wr_rx_put_credits(link, saved_credits); - atomic_dec(&conn->cdc_pend_tx_wr); + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) || smc_link_usable(conn->lnk)) + wake_up(&conn->cdc_pend_tx_wq); } return rc; @@ -169,8 +169,10 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn, smp_mb__after_atomic(); /* Make sure 
cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (unlikely(rc)) - atomic_dec(&conn->cdc_pend_tx_wr); + if (unlikely(rc)) { + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) || smc_link_usable(conn->lnk)) + wake_up(&conn->cdc_pend_tx_wq); + } return rc; } @@ -231,7 +233,8 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn) { - wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr)); + wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr) || + !smc_link_usable(conn->lnk) || conn->lgr->terminating); } /* Send a SMC-D CDC header. @@ -365,7 +368,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, } /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - if ((diff_cons && smc_tx_prepared_sends(conn)) || + if ((diff_cons && smc_tx_prepared_sends(conn) && + conn->local_tx_ctrl.prod_flags.write_blocked) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || conn->local_rx_ctrl.prod_flags.urg_data_pending) { if (!sock_owned_by_user(&smc->sk)) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index bd07837d21d995ef5c5d3b7cf51e79439d9c4947..365831c683f12e82a8d55821d58be2e327255feb 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -795,7 +795,13 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = send_len; + down_read(&smc->clcsock_release_lock); + if (!smc->clcsock || !smc->clcsock->sk) { + up_read(&smc->clcsock_release_lock); + return -EPROTO; + } len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); + up_read(&smc->clcsock_release_lock); if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 
0 : len; @@ -1034,7 +1040,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, ETH_ALEN); hton24(clc->r0.qpn, link->roce_qp->qp_num); clc->r0.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); switch (clc->hdr.type) { @@ -1049,8 +1055,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, break; } clc->r0.rmbe_size = conn->rmbe_size_short; - clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[link->link_idx].sgl)); + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); if (version == SMC_V1) { clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index eb4bba54d6df77f3fc036959dc7077c0e9de478f..7b068f7e0519ab94f84f6ead7c17230cfc3a6da8 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -62,8 +62,8 @@ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ -#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ +#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ +#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 038bcafe9a9e9eb21efacbdf814365d9aadc85b9..74321f6b2230cc938d1d6a0218b42f6f305d14ab 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -25,9 +25,10 @@ void smc_clcsock_release(struct smc_sock *smc) { struct 
socket *tcp; - if (smc->listen_smc && current_work() != &smc->smc_listen_work) + if (smc->listen_smc && !smc->use_fallback && + current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); - mutex_lock(&smc->clcsock_release_lock); + down_write(&smc->clcsock_release_lock); /* don't release clcsock for eRDMA */ if (smc->clcsock) { tcp = smc->clcsock; @@ -35,7 +36,7 @@ void smc_clcsock_release(struct smc_sock *smc) if (!smc->keep_clcsock) sock_release(tcp); } - mutex_unlock(&smc->clcsock_release_lock); + up_write(&smc->clcsock_release_lock); } static void smc_close_cleanup_listen(struct sock *parent) @@ -201,6 +202,7 @@ int smc_close_active(struct smc_sock *smc) long timeout; int rc = 0; int rc1 = 0; + int i = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? @@ -225,7 +227,8 @@ int smc_close_active(struct smc_sock *smc) } smc_close_cleanup_listen(sk); release_sock(sk); - flush_work(&smc->tcp_listen_work); + for (i = 0; i < SMC_MAX_TCP_LISTEN_WORKS; i++) + flush_work(&smc->tcp_listen_works[i].work); lock_sock(sk); break; case SMC_ACTIVE: diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9ccf9a432c3c06f90c0184892aa9a786768976c4..6d7ab53ead5ee738af4274f4777bcd19409e2664 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -46,6 +46,10 @@ struct smc_lgr_list smc_lgr_list = { /* established link groups */ .num = 0, }; +struct smc_lgr_manager smc_lgr_manager = { + .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_manager.lock), +}; + static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); @@ -55,6 +59,227 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); static void smc_link_down_work(struct work_struct *work); +/* SMC-R lgr cluster compare func */ +static int smcr_lnk_cluster_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct smc_lnk_cluster_compare_arg *key = arg->key; + const struct smc_lnk_cluster 
*lnkc = obj; + + if (memcmp(key->peer_systemid, lnkc->peer_systemid, SMC_SYSTEMID_LEN)) + return 1; + + if (memcmp(key->peer_gid, lnkc->peer_gid, SMC_GID_SIZE)) + return 1; + + if (key->smcr_version != SMC_V2 && memcmp(key->peer_mac, lnkc->peer_mac, ETH_ALEN)) + return 1; + + return 0; +} + +/* SMC-R lgr cluster hash func */ +static u32 smcr_lnk_cluster_hashfn(const void *data, u32 len, u32 seed) +{ + const struct smc_lnk_cluster *lnkc = data; + + return jhash2((u32 *)lnkc->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed); +} + +/* SMC-R lgr cluster compare arg hash func */ +static u32 smcr_lnk_cluster_compare_arg_hashfn(const void *data, u32 len, u32 seed) +{ + const struct smc_lnk_cluster_compare_arg *key = data; + + return jhash2((u32 *)key->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed); +} + +static const struct rhashtable_params smcr_lnk_cluster_rhl_params = { + .head_offset = offsetof(struct smc_lnk_cluster, rnode), + .key_len = sizeof(struct smc_lnk_cluster_compare_arg), + .obj_cmpfn = smcr_lnk_cluster_cmpfn, + .obj_hashfn = smcr_lnk_cluster_hashfn, + .hashfn = smcr_lnk_cluster_compare_arg_hashfn, + .automatic_shrinking = true, +}; + +/* hold a reference for smc_lnk_cluster */ +static inline struct smc_lnk_cluster *smc_lnk_cluster_hold(struct smc_lnk_cluster *lnkc) +{ + if (lnkc) + refcount_inc(&lnkc->ref); + return lnkc; +} + +/* release a reference for smc_lnk_cluster */ +static inline void smc_lnk_cluster_put(struct smc_lnk_cluster *lnkc) +{ + bool do_free = false; + + if (!lnkc) + return; + + if (refcount_dec_and_lock(&lnkc->ref, &smc_lgr_manager.lock)) { + do_free = true; + rhashtable_remove_fast(&smc_lgr_manager.lnk_cluster_maps, &lnkc->rnode, + smcr_lnk_cluster_rhl_params); + spin_unlock(&smc_lgr_manager.lock); + } + if (do_free) + kfree(lnkc); +} + +/* Get or create smc_lnk_cluster by key + * This function will hold a reference of returned smc_lnk_cluster + * or set refcount to one if have to create. 
+ * caller MUST call smc_lnk_cluster_put after this. + */ +static inline struct smc_lnk_cluster * +smcr_lnk_get_or_create_cluster(struct smc_lnk_cluster_compare_arg *key) +{ + struct smc_lnk_cluster *lnkc; + int err; + + spin_lock(&smc_lgr_manager.lock); + lnkc = rhashtable_lookup_fast(&smc_lgr_manager.lnk_cluster_maps, key, + smcr_lnk_cluster_rhl_params); + if (!lnkc) { + lnkc = kzalloc(sizeof(*lnkc), GFP_ATOMIC); + if (unlikely(!lnkc)) + goto fail; /* decline */ + + /* init cluster */ + spin_lock_init(&lnkc->lock); + init_waitqueue_head(&lnkc->first_contact_waitqueue); + memcpy(lnkc->peer_systemid, key->peer_systemid, SMC_SYSTEMID_LEN); + memcpy(lnkc->peer_gid, key->peer_gid, SMC_GID_SIZE); + memcpy(lnkc->peer_mac, key->peer_mac, ETH_ALEN); + refcount_set(&lnkc->ref, 1); + + err = rhashtable_insert_fast(&smc_lgr_manager.lnk_cluster_maps, &lnkc->rnode, + smcr_lnk_cluster_rhl_params); + if (unlikely(err)) { + pr_warn_ratelimited("rhashtable_insert_fast failed"); + kfree(lnkc); + lnkc = NULL; + } + } else { + lnkc = smc_lnk_cluster_hold(lnkc); + } +fail: + spin_unlock(&smc_lgr_manager.lock); + return lnkc; +} + +/* caller MUST call smc_lnk_cluster_put after this. + */ +static inline struct smc_lnk_cluster *smcr_lnk_get_cluster(struct smc_link *lnk) +{ + struct smc_lnk_cluster_compare_arg key; + struct smc_link_group *lgr; + + lgr = lnk->lgr; + if (!lgr || lgr->is_smcd || lgr->role != SMC_SERV) + return NULL; + + key.smcr_version = lgr->smc_version; + key.peer_systemid = lgr->peer_systemid; + key.peer_gid = lnk->peer_gid; + key.peer_mac = lnk->peer_mac; + + return smcr_lnk_get_or_create_cluster(&key); +} + +/* caller MUST call smc_lnk_cluster_put after this. 
+ */ +static inline struct smc_lnk_cluster * +smcr_lnk_get_cluster_by_ini(struct smc_init_info *ini, int role) +{ + struct smc_lnk_cluster_compare_arg key; + + if (ini->is_smcd || role != SMC_SERV) + return NULL; + + key.smcr_version = ini->smcr_version; + key.peer_systemid = ini->peer_systemid; + key.peer_gid = ini->peer_gid; + key.peer_mac = ini->peer_mac; + + return smcr_lnk_get_or_create_cluster(&key); +} + +/* callback when smc link state change */ +void smcr_lnk_cluster_on_lnk_state(struct smc_link *lnk, struct smc_init_info *ini) +{ + struct smc_lnk_cluster *lnkc; + int nr = 0; + + /* barrier for lnk->state */ + smp_wmb(); + + /* only first link & server can made connections block on + * first_contact_waitqueue + */ + if (lnk->link_idx != SMC_SINGLE_LINK || lnk->lgr->role != SMC_SERV) + return; + + /* state already seen */ + if (lnk->state_record & SMC_LNK_STATE_BIT(lnk->state)) + return; + + /* before smc_link_save_peer_info, we can not find lnkc + * by lnk + */ + lnkc = ini ? smcr_lnk_get_cluster_by_ini(ini, SMC_SERV) : + smcr_lnk_get_cluster(lnk); + + if (unlikely(!lnkc)) + return; + + spin_lock(&lnkc->lock); + + /* all lnk state change should be + * 1. SMC_LNK_UNUSED -> SMC_LNK_TEAR_DWON (link init failed) + * 2. SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_TEAR_DWON + * 3. SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_INACTIVE -> SMC_LNK_TEAR_DWON + * 4. SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_INACTIVE -> SMC_LNK_TEAR_DWON + * 5. 
SMC_LNK_UNUSED -> SMC_LNK_ATIVATING -> SMC_LNK_ACTIVE ->SMC_LNK_INACTIVE + * -> SMC_LNK_TEAR_DWON + */ + switch (lnk->state) { + case SMC_LNK_ACTIVATING: + /* It's safe to hold a reference without lock + * dues to the smcr_lnk_get_cluster already hold one + */ + smc_lnk_cluster_hold(lnkc); + break; + case SMC_LNK_TEAR_DWON: + if (lnk->state_record & SMC_LNK_STATE_BIT(SMC_LNK_ACTIVATING)) + /* smc_lnk_cluster_hold in SMC_LNK_ACTIVATING */ + smc_lnk_cluster_put(lnkc); + fallthrough; + case SMC_LNK_ACTIVE: + case SMC_LNK_INACTIVE: + if (!(lnk->state_record & + (SMC_LNK_STATE_BIT(SMC_LNK_ACTIVE) + | SMC_LNK_STATE_BIT(SMC_LNK_INACTIVE)))) { + lnkc->pending_capability -= (SMC_RMBS_PER_LGR_MAX - 1); + /* TODO: wakeup just one to perfrom first contact + * if record state has no SMC_LNK_ACTIVE + */ + nr = SMC_RMBS_PER_LGR_MAX - 1; + } + break; + case SMC_LNK_UNUSED: + pr_warn_ratelimited("smc: invalid lnk state. "); + break; + } + SMC_LNK_STATE_RECORD(lnk, lnk->state); + spin_unlock(&lnkc->lock); + if (nr) + wake_up_nr(&lnkc->first_contact_waitqueue, nr); + smc_lnk_cluster_put(lnkc); /* smc_lnk_cluster_hold in smcr_lnk_get_cluster */ +} + /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) @@ -347,6 +572,8 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type)) + goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto errattr; memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); @@ -646,8 +873,10 @@ static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (smc_link_sendable(lnk)) + if (smc_link_sendable(lnk)) { lnk->state = SMC_LNK_INACTIVE; + smcr_lnk_cluster_on_lnk_state(lnk, NULL); + } } 
wake_up_all(&lgr->llc_msg_waiter); wake_up_all(&lgr->llc_flow_waiter); @@ -772,18 +1001,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); - - atomic_set(&lnk->total_send_cnt, 0); - atomic_set(&lnk->total_comp_cnt, 0); - atomic_set(&lnk->reg_send_cnt, 0); - atomic_set(&lnk->reg_comp_cnt, 0); - atomic_set(&lnk->cdc_send_cnt, 0); - atomic_set(&lnk->cdc_comp_cnt, 0); - atomic_set(&lnk->llc_send_cnt, 0); - atomic_set(&lnk->llc_comp_cnt, 0); - atomic_set(&lnk->rdma_write_cnt, 0); - atomic_set(&lnk->bad_comp_cnt, 0); - smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!lnk->smcibdev->initialized) { @@ -816,6 +1033,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, if (rc) goto destroy_qp; lnk->state = SMC_LNK_ACTIVATING; + smcr_lnk_cluster_on_lnk_state(lnk, ini); return 0; destroy_qp: @@ -830,6 +1048,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; + lnk->state = SMC_LNK_TEAR_DWON; + smcr_lnk_cluster_on_lnk_state(lnk, ini); memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) @@ -936,6 +1156,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; + lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type; atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; @@ -1115,34 +1336,37 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, return NULL; } -static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, +static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { + struct mutex *lock; /* lock buffer list */ int rc; - if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + 
if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ mutex_lock(&lgr->llc_conf_mutex); - smc_llc_do_delete_rkey(lgr, rmb_desc); - rmb_desc->is_conf_rkey = false; + smc_llc_do_delete_rkey(lgr, buf_desc); + buf_desc->is_conf_rkey = false; mutex_unlock(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } - if (rmb_desc->is_reg_err) { + if (buf_desc->is_reg_err) { /* buf registration failed, reuse not possible */ - mutex_lock(&lgr->rmbs_lock); - list_del(&rmb_desc->list); - mutex_unlock(&lgr->rmbs_lock); + lock = is_rmb ? &lgr->rmbs_lock : + &lgr->sndbufs_lock; + mutex_lock(lock); + list_del(&buf_desc->list); + mutex_unlock(lock); - smc_buf_free(lgr, true, rmb_desc); + smc_buf_free(lgr, is_rmb, buf_desc); } else { - rmb_desc->used = 0; - memset(rmb_desc->cpu_addr, 0, rmb_desc->len); + buf_desc->used = 0; + memset(buf_desc->cpu_addr, 0, buf_desc->len); } } @@ -1150,15 +1374,23 @@ static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) { - conn->sndbuf_desc->used = 0; - memset(conn->sndbuf_desc->cpu_addr, 0, conn->sndbuf_desc->len); + if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { + smcr_buf_unuse(conn->sndbuf_desc, false, lgr); + } else { + conn->sndbuf_desc->used = 0; + memset(conn->sndbuf_desc->cpu_addr, 0, + conn->sndbuf_desc->len); + } } - if (conn->rmb_desc && lgr->is_smcd) { - conn->rmb_desc->used = 0; - memset(conn->rmb_desc->cpu_addr, 0, conn->rmb_desc->len + - sizeof(struct smcd_cdc_msg)); - } else if (conn->rmb_desc) { - smcr_buf_unuse(conn->rmb_desc, lgr); + if (conn->rmb_desc) { + if (!lgr->is_smcd) { + smcr_buf_unuse(conn->rmb_desc, true, lgr); + } else { + conn->rmb_desc->used = 0; + memset(conn->rmb_desc->cpu_addr, 0, + conn->rmb_desc->len + + sizeof(struct smcd_cdc_msg)); + } } } @@ -1206,20 +1438,21 @@ void smc_conn_free(struct 
smc_connection *conn) static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - if (is_rmb) + if (is_rmb || buf_desc->is_vm) buf_desc->is_reg_mr[lnk->link_idx] = false; if (!buf_desc->is_map_ib[lnk->link_idx]) return; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) { - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - buf_desc->mr_rx[lnk->link_idx] = NULL; - } + + if ((is_rmb || buf_desc->is_vm) && + buf_desc->mr[lnk->link_idx]) { + smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); + buf_desc->mr[lnk->link_idx] = NULL; + } + if (is_rmb) smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { + else smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); - } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); buf_desc->is_map_ib[lnk->link_idx] = false; } @@ -1266,6 +1499,8 @@ static void __smcr_link_clear(struct smc_link *lnk) sock_release(lnk->clcsock); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; + lnk->state = SMC_LNK_TEAR_DWON; + smcr_lnk_cluster_on_lnk_state(lnk, NULL); memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) @@ -1313,8 +1548,10 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); } - if (buf_desc->pages) + if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); + else if (buf_desc->is_vm && buf_desc->cpu_addr) + vfree(buf_desc->cpu_addr); kfree(buf_desc); } @@ -1477,6 +1714,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) /* cancel free_work sync, will terminate when lgr->freeing is set */ cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; + /* memory barrier */ + smp_wmb(); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -1486,6 +1725,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) conn = 
rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); /* sock_put below */ + /* try wakeup all */ + wake_up_all(&conn->cdc_pend_tx_wq); lock_sock(&smc->sk); smc_conn_kill(conn, soft); release_sock(&smc->sk); @@ -1732,6 +1973,7 @@ void smcr_link_down_cond(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); + smcr_lnk_cluster_on_lnk_state(lnk, NULL); smcr_link_down(lnk); } } @@ -1741,6 +1983,7 @@ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); + smcr_lnk_cluster_on_lnk_state(lnk, NULL); schedule_work(&lnk->link_down_wrk); } } @@ -1884,11 +2127,13 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; struct net *net = sock_net(&smc->sk); + DECLARE_WAITQUEUE(wait, current); + struct smc_lnk_cluster *lnkc = NULL; struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; spinlock_t *lgr_lock; - int rc = 0; + int rc = 0, timeo = CLC_WAIT_TIME; lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list : &smc_lgr_list.list; @@ -1896,12 +2141,20 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) &smc_lgr_list.lock; ini->first_contact_local = 1; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; + + if (!ini->is_smcd && role == SMC_SERV) { + lnkc = smcr_lnk_get_cluster_by_ini(ini, role); + if (unlikely(!lnkc)) + return SMC_CLC_DECL_INTERR; + } + if (role == SMC_CLNT && ini->first_contact_peer) /* create new link group as well */ goto create; /* determine if an existing link group can be reused */ spin_lock_bh(lgr_lock); +again: list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? 
@@ -1916,21 +2169,52 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && - !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { + (SMC_RMBS_PER_LGR_MAX - + bitmap_weight(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) + > atomic_read(&lgr->rtoken_pendings))))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); - if (!rc && delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + if (!rc) { + smc_conn_enter_rtoken_pending(smc, ini); + if (delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); + } break; } write_unlock_bh(&lgr->conns_lock); } + if (lnkc && ini->first_contact_local) { + spin_lock(&lnkc->lock); + if (lnkc->pending_capability > lnkc->conns_pending) { + lnkc->conns_pending++; + spin_unlock(&lnkc->lock); + spin_unlock_bh(lgr_lock); + + add_wait_queue(&lnkc->first_contact_waitqueue, &wait); + set_current_state(TASK_INTERRUPTIBLE); + /* need to wait at least once first contact done */ + timeo = schedule_timeout(timeo); + set_current_state(TASK_RUNNING); + remove_wait_queue(&lnkc->first_contact_waitqueue, &wait); + spin_lock_bh(lgr_lock); + spin_lock(&lnkc->lock); + + lnkc->conns_pending--; + if (timeo) { + spin_unlock(&lnkc->lock); + goto again; + } + } + /* first_contact */ + lnkc->pending_capability += (SMC_RMBS_PER_LGR_MAX - 1); + spin_unlock(&lnkc->lock); + } spin_unlock_bh(lgr_lock); if (rc) - return rc; + goto out; if (role == SMC_CLNT && !ini->first_contact_peer && ini->first_contact_local) { @@ -1938,7 +2222,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) * a new one * send out_of_sync decline, reason synchr. 
error */ - return SMC_CLC_DECL_SYNCERR; + rc = SMC_CLC_DECL_SYNCERR; + goto out; } create: @@ -1979,6 +2264,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) #endif out: + /* smc_lnk_cluster_hold in smcr_lnk_get_or_create_cluster */ + smc_lnk_cluster_put(lnkc); return rc; } @@ -2047,39 +2334,69 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -/* map an rmb buf to a link */ +/* map an buf to a link */ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { - int rc; + int rc, i, nents, offset, buf_size, size, access_flags; + struct scatterlist *sg; + void *buf; if (buf_desc->is_map_ib[lnk->link_idx]) return 0; - rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (buf_desc->is_vm) { + buf = buf_desc->cpu_addr; + buf_size = buf_desc->len; + offset = offset_in_page(buf_desc->cpu_addr); + nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; + } else { + nents = 1; + } + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); if (rc) return rc; - sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, - buf_desc->cpu_addr, buf_desc->len); + + if (buf_desc->is_vm) { + /* virtually contiguous buffer */ + for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { + size = min_t(int, PAGE_SIZE - offset, buf_size); + sg_set_page(sg, vmalloc_to_page(buf), size, offset); + buf += size / sizeof(*buf); + buf_size -= size; + offset = 0; + } + } else { + /* physically contiguous buffer */ + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + } /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { + if (rc != nents) { rc = -EAGAIN; goto free_table; } - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, + buf_desc->is_dma_need_sync |= + smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; + + if (is_rmb || buf_desc->is_vm) { + /* create a new memory region for the RMB or vzalloced sndbuf */ + access_flags = is_rmb ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_LOCAL_WRITE; + + rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, buf_desc, lnk->link_idx); if (rc) goto buf_unmap; - smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + smc_ib_sync_sg_for_device(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } buf_desc->is_map_ib[lnk->link_idx] = true; return 0; @@ -2092,20 +2409,23 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, return rc; } -/* register a new rmb on IB device, +/* register a new buf on IB device, rmb or vzalloced sndbuf * must be called under lgr->llc_conf_mutex lock */ -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) { if (list_empty(&link->lgr->list)) return -ENOLINK; - if (!rmb_desc->is_reg_mr[link->link_idx]) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->is_reg_err = true; + if (!buf_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new buf */ + if (buf_desc->is_vm) + buf_desc->mr[link->link_idx]->iova = + (uintptr_t)buf_desc->cpu_addr; + if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { + buf_desc->is_reg_err = true; return -EFAULT; } - rmb_desc->is_reg_mr[link->link_idx] = true; + buf_desc->is_reg_mr[link->link_idx] = true; } return 0; } @@ 
-2157,18 +2477,38 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) struct smc_buf_desc *buf_desc, *bf; int i, rc = 0; + /* reg all RMBs for a new link */ mutex_lock(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; - rc = smcr_link_reg_rmb(lnk, buf_desc); - if (rc) - goto out; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + mutex_unlock(&lgr->rmbs_lock); + return rc; + } } } -out: mutex_unlock(&lgr->rmbs_lock); + + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + return rc; + + /* reg all vzalloced sndbufs for a new link */ + mutex_lock(&lgr->sndbufs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { + if (!buf_desc->used || !buf_desc->is_vm) + continue; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + mutex_unlock(&lgr->sndbufs_lock); + return rc; + } + } + } + mutex_unlock(&lgr->sndbufs_lock); return rc; } @@ -2182,18 +2522,39 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, if (!buf_desc) return ERR_PTR(-ENOMEM); - buf_desc->order = get_order(bufsize); - buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_COMP | - __GFP_NORETRY | __GFP_ZERO, - buf_desc->order); - if (!buf_desc->pages) { - kfree(buf_desc); - return ERR_PTR(-EAGAIN); - } - buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); - buf_desc->len = bufsize; + switch (lgr->buf_type) { + case SMCR_PHYS_CONT_BUFS: + case SMCR_MIXED_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (buf_desc->pages) { + buf_desc->cpu_addr = + (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + buf_desc->is_vm = false; + break; + } + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + goto out; + fallthrough; // try virtually continguous 
buf + case SMCR_VIRT_CONT_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); + if (!buf_desc->cpu_addr) + goto out; + buf_desc->pages = NULL; + buf_desc->len = bufsize; + buf_desc->is_vm = true; + break; + } return buf_desc; + +out: + kfree(buf_desc); + return ERR_PTR(-EAGAIN); } /* map buf_desc on all usable links, @@ -2289,6 +2650,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { + buf_desc->is_dma_need_sync = 0; SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); break; /* found reusable slot */ @@ -2323,7 +2685,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { - smcr_buf_unuse(buf_desc, lgr); + smcr_buf_unuse(buf_desc, is_rmb, lgr); return -ENOMEM; } } @@ -2345,16 +2707,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) return 0; } -void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) -{ - if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || - !smc_link_active(conn->lnk)) - return; - smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); -} - void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { + if (!conn->sndbuf_desc->is_dma_need_sync) + return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) return; @@ -2365,6 +2721,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; + if (!conn->rmb_desc->is_dma_need_sync) + return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { @@ -2375,20 +2733,6 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) } } -void smc_rmb_sync_sg_for_device(struct smc_connection *conn) -{ - int i; - - if 
(!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) - return; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_active(&conn->lgr->lnk[i])) - continue; - smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, - DMA_FROM_DEVICE); - } -} - /* create the send and receive buffer for an SMC socket; * receive buffers are called RMBs; * (even though the SMC protocol allows more than one RMB-element per RMB, @@ -2580,6 +2924,8 @@ static struct notifier_block smc_reboot_notifier = { int __init smc_core_init(void) { + /* init smc lgr manager */ + rhashtable_init(&smc_lgr_manager.lnk_cluster_maps, &smcr_lnk_cluster_rhl_params); return register_reboot_notifier(&smc_reboot_notifier); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 3d8954ca0af1443da48be114873f36d22de85609..f54dd28e03426824d553c94c164683491ef60933 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -34,6 +35,40 @@ struct smc_lgr_list { /* list of link group definition */ u32 num; /* unique link group number */ }; +struct smc_lgr_manager { /* manager for link group */ + struct rhashtable lnk_cluster_maps; /* maps of smc_lnk_cluster */ + spinlock_t lock; /* lock for lgr_cm_maps */ +}; + +struct smc_lnk_cluster { + struct rhash_head rnode; /* node for rhashtable */ + struct wait_queue_head first_contact_waitqueue; + /* queue for non first contact to wait + * first contact to be established. + */ + spinlock_t lock; /* protection for link group */ + refcount_t ref; /* refcount for cluster */ + unsigned long pending_capability; + /* maximum pending number of connections that + * need wait first contact complete. 
+ */ + unsigned long conns_pending; + /* connections that are waiting for first contact + * complete + */ + u8 peer_systemid[SMC_SYSTEMID_LEN]; + u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ + u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ +}; + +struct smc_lnk_cluster_compare_arg /* key for smc_lnk_cluster */ +{ + int smcr_version; + u8 *peer_systemid; + u8 *peer_gid; + u8 *peer_mac; +}; + enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ @@ -44,8 +79,14 @@ enum smc_link_state { /* possible states of a link */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ + SMC_LNK_TEAR_DWON, /* link is tear down */ }; +#define SMC_LNK_STATE_BIT(state) (1 << (state)) + +#define SMC_LNK_STATE_RECORD(lnk, state) \ + ((lnk)->state_record |= SMC_LNK_STATE_BIT(state)) + #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ #define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ @@ -94,8 +135,7 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ - struct smc_ib_cq *smcibcq_recv; /* cq for recv */ - struct smc_ib_cq *smcibcq_send; /* cq for send */ + struct smc_ib_cq *smcibcq; /* cq for recv & send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ @@ -138,6 +178,7 @@ struct smc_link { u8 credits_enable; /* credits enable flag, set when negotiation */ u8 local_cr_watermark_high; /* local rq credits watermark */ u8 peer_cr_watermark_low; /* peer rq credits watermark */ + u8 credits_update_limit; /* credits update limit for cdc msg */ struct work_struct credits_announce_work; /* work for credits announcement */ unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ @@ -163,22 +204,13 @@ struct smc_link { int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ + int state_record; /* 
record of previous state */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ struct socket *clcsock; /* keep for eRDMA */ - atomic_t total_send_cnt; - atomic_t total_comp_cnt; - atomic_t cdc_send_cnt; - atomic_t cdc_comp_cnt; - atomic_t llc_send_cnt; - atomic_t llc_comp_cnt; - atomic_t reg_send_cnt; - atomic_t reg_comp_cnt; - atomic_t rdma_write_cnt; - atomic_t bad_comp_cnt; }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -198,9 +230,11 @@ struct smc_buf_desc { struct { /* SMC-R */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region + struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; + /* memory region: for rmb and + * vzalloced sndbuf * incl. rkey provided to peer + * and lkey provided to local */ u32 order; /* allocation order */ @@ -210,8 +244,11 @@ struct smc_buf_desc { /* mem region registered */ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; /* mem region mapped to lnk */ + u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ + u8 is_vm; + /* virtually contiguous */ }; struct { /* SMC-D */ unsigned short sba_idx; @@ -246,6 +283,12 @@ enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ }; +enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ + SMCR_PHYS_CONT_BUFS = 0, + SMCR_VIRT_CONT_BUFS = 1, + SMCR_MIXED_BUFS = 2, +}; + enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, @@ -266,6 +309,9 @@ struct smc_link_group { struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ + atomic_t rtoken_pendings;/* number of connection that + * lgr assigned but no rtoken got yet + */ unsigned short 
vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ @@ -307,6 +353,7 @@ struct smc_link_group { /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; + enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */ @@ -543,10 +590,8 @@ void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey); -void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); -void smc_rmb_sync_sg_for_device(struct smc_connection *conn); int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); @@ -567,7 +612,7 @@ int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); -int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); @@ -577,6 +622,26 @@ int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); +static inline void smc_conn_enter_rtoken_pending(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_link_group *lgr; + + lgr = smc->conn.lgr; + if (lgr && !ini->first_contact_local) + atomic_inc(&lgr->rtoken_pendings); +} + 
+static inline void smc_conn_leave_rtoken_pending(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_link_group *lgr; + + lgr = smc->conn.lgr; + if (lgr && !ini->first_contact_local) + atomic_dec(&lgr->rtoken_pendings); +} + +void smcr_lnk_cluster_on_lnk_state(struct smc_link *lnk, struct smc_init_info *ini); + static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index c98e871b54c45a066559e0984cb93286cfd27476..b922009bf6b8b37d89341511d769b7bb57b2ea46 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,15 +131,6 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, - IB_CQ_SOLICITED_MASK); - if (rc) - goto out; - - rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc) - goto out; rc = smc_wr_rx_post_init(lnk); if (rc) @@ -630,21 +621,16 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, - bool is_send) +static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev) { struct smc_ib_cq *smcibcq, *cq; int min, i; - if (is_send) - smcibcq = smcibdev->smcibcq_send; - else - smcibcq = smcibdev->smcibcq_recv; - + smcibcq = smcibdev->smcibcq; cq = smcibcq; min = cq->load; - for (i = 0; i < smcibdev->num_cq_peer; i++) { + for (i = 0; i < smcibdev->num_cq; i++) { if (smcibcq[i].load < min) { cq = &smcibcq[i]; min = cq->load; @@ -685,27 +671,22 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) { if (lnk->roce_qp) { ib_destroy_qp(lnk->roce_qp); - smc_ib_put_cq(lnk->smcibcq_send); - smc_ib_put_cq(lnk->smcibcq_recv); + smc_ib_put_cq(lnk->smcibcq); } lnk->roce_qp = NULL; - lnk->smcibcq_send = NULL; - lnk->smcibcq_recv = NULL; + lnk->smcibcq = NULL; } /* create a queue pair within 
the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, - true); - struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, - false); + struct smc_ib_cq *smcibcq = smc_ib_get_least_used_cq(lnk->smcibdev); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = smcibcq_send->ib_cq, - .recv_cq = smcibcq_recv->ib_cq, + .send_cq = smcibcq->ib_cq, + .recv_cq = smcibcq->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -717,6 +698,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_recv_wr = SMC_WR_BUF_CNT, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, + .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, @@ -734,8 +716,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; } else { - lnk->smcibcq_send = smcibcq_send; - lnk->smcibcq_recv = smcibcq_recv; + lnk->smcibcq = smcibcq; smc_wr_remember_qp_attr(lnk); } return rc; @@ -752,7 +733,7 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + sg_num = ib_map_mr_sg(buf_slot->mr[link_idx], buf_slot->sgt[link_idx].sgl, buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); @@ -764,25 +745,49 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[link_idx]) + if (buf_slot->mr[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[link_idx] = + buf_slot->mr[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[link_idx])) { + 
if (IS_ERR(buf_slot->mr[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[link_idx]); - buf_slot->mr_rx[link_idx] = NULL; + rc = PTR_ERR(buf_slot->mr[link_idx]); + buf_slot->mr[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != + buf_slot->sgt[link_idx].orig_nents) return -EINVAL; return 0; } +bool smc_ib_is_sg_need_sync(struct smc_link *lnk, + struct smc_buf_desc *buf_slot) +{ + struct scatterlist *sg; + unsigned int i; + bool ret = false; + + /* for now there is just one DMA address */ + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { + if (!sg_dma_len(sg)) + break; + if (dma_need_sync(lnk->smcibdev->ibdev->dma_device, + sg_dma_address(sg))) { + ret = true; + goto out; + } + } + +out: + return ret; +} + /* synchronize buffer usage for cpu access */ void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, @@ -791,6 +796,9 @@ void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct scatterlist *sg; unsigned int i; + if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) + return; + /* for now there is just one DMA address */ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, buf_slot->sgt[lnk->link_idx].nents, i) { @@ -811,6 +819,9 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct scatterlist *sg; unsigned int i; + if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) + return; + /* for now there is just one DMA address */ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, buf_slot->sgt[lnk->link_idx].nents, i) { @@ -854,26 +865,72 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } -static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) +static const struct dim_cq_moder +smc_dim_profile[RDMA_DIM_PARAMS_NUM_PROFILES] = { + {1, 0, 1, 0}, + {1, 0, 4, 0}, + {2, 0, 4, 0}, + {2, 0, 8, 0}, + {4, 0, 8, 0}, + {16, 0, 8, 0}, + {16, 0, 16, 0}, + {32, 0, 
16, 0}, + {32, 0, 32, 0}, +}; + +static void smc_ib_dim_work(struct work_struct *w) { - int i; + struct dim *dim = container_of(w, struct dim, work); + struct ib_cq *cq = dim->priv; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - if (smcibdev->smcibcq_send[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); + u16 usec = smc_dim_profile[dim->profile_ix].usec; + u16 comps = smc_dim_profile[dim->profile_ix].comps; - if (smcibdev->smcibcq_recv[i].ib_cq) - ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); - } + dim->state = DIM_START_MEASURE; + cq->device->ops.modify_cq(cq, comps, usec); +} + +static void smc_ib_dim_init(struct ib_cq *cq) +{ + struct dim *dim; + + if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim) + return; + + dim = kzalloc(sizeof(*dim), GFP_KERNEL); + if (!dim) + return; + + dim->state = DIM_START_MEASURE; + dim->tune_state = DIM_GOING_RIGHT; + dim->profile_ix = RDMA_DIM_START_PROFILE; + dim->priv = cq; + cq->dim = dim; + + INIT_WORK(&dim->work, smc_ib_dim_work); +} + +static void smc_ib_dim_destroy(struct ib_cq *cq) +{ + if (!cq->dim) + return; - kfree(smcibdev->smcibcq_send); - kfree(smcibdev->smcibcq_recv); + cancel_work_sync(&cq->dim->work); + kfree(cq->dim); } -static void cq_event_handler(struct ib_event *event, void *data) +static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) { - pr_warn("smc: event %u (%s) data %p\n", - event->event, ib_event_msg(event->event), data); + int i; + + for (i = 0; i < smcibdev->num_cq; i++) { + if (smcibdev->smcibcq[i].ib_cq) { + smc_ib_dim_destroy(smcibdev->smcibcq[i].ib_cq); + ib_destroy_cq(smcibdev->smcibcq[i].ib_cq); + } + } + + kfree(smcibdev->smcibcq); } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) @@ -881,7 +938,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; int cqe_size_order, smc_order; struct smc_ib_cq *smcibcq; - int i, num_cq_peer; + int i, num_cq; long rc; mutex_lock(&smcibdev->mutex); @@ 
-893,45 +950,32 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - num_cq_peer = min_t(int, smcibdev->ibdev->num_comp_vectors, - num_online_cpus()); - smcibdev->num_cq_peer = num_cq_peer; - smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_send) { - rc = -ENOMEM; - goto err; - } - smcibdev->smcibcq_recv = kcalloc(num_cq_peer, sizeof(*smcibcq), - GFP_KERNEL); - if (!smcibdev->smcibcq_recv) { + num_cq = min_t(int, smcibdev->ibdev->num_comp_vectors, + num_online_cpus()); + smcibdev->num_cq = num_cq; + smcibdev->smcibcq = kcalloc(num_cq, sizeof(*smcibcq), GFP_KERNEL); + if (!smcibdev->smcibcq) { rc = -ENOMEM; goto err; } /* initialize CQs */ - for (i = 0; i < num_cq_peer; i++) { - /* initialize send CQ */ - smcibcq = &smcibdev->smcibcq_send[i]; + for (i = 0; i < num_cq; i++) { + smcibcq = &smcibdev->smcibcq[i]; smcibcq->smcibdev = smcibdev; - smcibcq->is_send = 1; cqattr.comp_vector = i; smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, cq_event_handler, + smc_wr_cq_handler, NULL, smcibcq, &cqattr); rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) + if (IS_ERR(smcibcq->ib_cq)) { + smcibcq->ib_cq = NULL; goto err; + } - /* initialize recv CQ */ - smcibcq = &smcibdev->smcibcq_recv[i]; - smcibcq->smcibdev = smcibdev; - cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, cq_event_handler, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (IS_ERR(smcibcq->ib_cq)) + smc_ib_dim_init(smcibcq->ib_cq); + rc = ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP); + if (rc) goto err; } smc_wr_add_dev(smcibdev); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 
1af83b5a2e7e0505a0edc0510a3bb13e5d918de5..906a6c57cac318515fc0f3d0cc5113c9919cef95 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -36,7 +37,6 @@ struct smc_ib_cq { /* ib_cq wrapper for smc */ struct smc_ib_device *smcibdev; /* parent ib device */ struct ib_cq *ib_cq; /* real ib_cq for link */ struct tasklet_struct tasklet; /* tasklet for wr */ - bool is_send; /* send for recv cq */ int load; /* load of current cq */ }; @@ -45,9 +45,8 @@ struct smc_ib_device { /* ib-device infos for smc */ struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - int num_cq_peer; /* num of snd/rcv cq peer */ - struct smc_ib_cq *smcibcq_send; /* send cqs */ - struct smc_ib_cq *smcibcq_recv; /* recv cqs */ + int num_cq; /* num of snd/rcv cq */ + struct smc_ib_cq *smcibcq; /* send & recv cqs */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; @@ -109,6 +108,8 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); +bool smc_ib_is_sg_need_sync(struct smc_link *lnk, + struct smc_buf_desc *buf_slot); void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index d323b81f6d0410329ad1a5ff5bd53cb9558b44e5..5a23123275ba581a32d6b2b84e28bd3d128b1049 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -389,8 +389,6 @@ static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend, enum ib_wc_status wc_status) { /* future work: handle wc_status error for recovery and failover */ - if (!wc_status) - atomic_inc(&link->llc_comp_cnt); } /** @@ -513,19 +511,22 @@ static int 
smc_llc_send_confirm_rkey(struct smc_link *send_link, if (smc_link_active(link) && link != send_link) { rkeyllc->rtoken[rtok_ix].link_id = link->link_id; rkeyllc->rtoken[rtok_ix].rmb_key = - htonl(rmb_desc->mr_rx[link->link_idx]->rkey); - rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address( - rmb_desc->sgt[link->link_idx].sgl)); + htonl(rmb_desc->mr[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[link->link_idx].sgl)); rtok_ix++; } } /* rkey of send_link is in rtoken[0] */ rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); - rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); + htonl(rmb_desc->mr[send_link->link_idx]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); put_out: @@ -552,7 +553,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.llc_type = SMC_LLC_DELETE_RKEY; smc_llc_init_msg_hdr(&rkeyllc->hd, link->lgr, sizeof(*rkeyllc)); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: @@ -622,9 +623,10 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, if (!buf_pos) break; rmb = buf_pos; - ext->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); - ext->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); - ext->rt[i].rmb_vaddr_new = + ext->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + ext->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + 
ext->rt[i].rmb_vaddr_new = rmb->is_vm ? + cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); while (buf_pos && !(buf_pos)->used) @@ -900,9 +902,10 @@ static int smc_llc_add_link_cont(struct smc_link *link, } rmb = *buf_pos; - addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); - addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); - addc_llc->rt[i].rmb_vaddr_new = + addc_llc->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = rmb->is_vm ? + cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); (*num_rkeys_todo)--; @@ -1337,6 +1340,7 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) return; /* no asymmetric link */ if (!smc_link_downing(&lnk_asym->state)) return; + smcr_lnk_cluster_on_lnk_state(lnk_asym, NULL); lnk_new = smc_switch_conns(lgr, lnk_asym, false); smc_wr_tx_wait_no_pending_sends(lnk_asym); if (!lnk_new) @@ -1556,6 +1560,7 @@ int smc_llc_srv_add_link(struct smc_link *link, out_err: if (link_new) { link_new->state = SMC_LNK_INACTIVE; + smcr_lnk_cluster_on_lnk_state(link_new, NULL); smcr_link_clear(link_new, false); } out: @@ -1666,8 +1671,10 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) del_llc->reason = 0; smc_llc_send_message(lnk, &qentry->msg); /* response */ - if (smc_link_downing(&lnk_del->state)) + if (smc_link_downing(&lnk_del->state)) { + smcr_lnk_cluster_on_lnk_state(lnk, NULL); smc_switch_conns(lgr, lnk_del, false); + } smcr_link_clear(lnk_del, true); active_links = smc_llc_active_link_count(lgr); @@ -1740,6 +1747,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) goto out; /* asymmetric link already deleted */ if (smc_link_downing(&lnk_del->state)) { + smcr_lnk_cluster_on_lnk_state(lnk, NULL); if 
(smc_switch_conns(lgr, lnk_del, false)) smc_wr_tx_wait_no_pending_sends(lnk_del); } @@ -2259,6 +2267,7 @@ void smc_llc_link_active(struct smc_link *link) schedule_delayed_work(&link->llc_testlink_wrk, link->llc_testlink_time); } + smcr_lnk_cluster_on_lnk_state(link, NULL); } /* called in worker context */ diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c index d9c11b31c4ab9cf3daacd540bedc4ab05e55df9b..106887b7b9e1a668ae243fb6eaa829c577b5b846 100644 --- a/net/smc/smc_proc.c +++ b/net/smc/smc_proc.c @@ -243,11 +243,9 @@ static int proc_show_links(struct seq_file *seq, void *v) struct smc_link *lnk; int i = 0, j = 0; - seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s\n", + seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s\n", "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", - "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags", "total_send", - "total_comp", "cdc_send", "cdc_comp", "llc_send", "llc_comp", "reg_send", - "reg_comp", "bad_comp", "rdma_write"); + "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags"); spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { @@ -257,24 +255,14 @@ static int proc_show_links(struct seq_file *seq, void *v) continue; for (j = 0; j < SMC_LGR_ID_SIZE; j++) seq_printf(seq, "%02X", lgr->id[j]); - seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u\n", + seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu\n", lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, atomic_read(&lnk->local_rq_credits), atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, - lnk->peer_cr_watermark_low, lnk->flags, - atomic_read(&lnk->total_send_cnt), - atomic_read(&lnk->total_comp_cnt), - atomic_read(&lnk->cdc_send_cnt), - atomic_read(&lnk->cdc_comp_cnt), - atomic_read(&lnk->llc_send_cnt), - atomic_read(&lnk->llc_comp_cnt), - atomic_read(&lnk->reg_send_cnt), - atomic_read(&lnk->reg_comp_cnt), - atomic_read(&lnk->bad_comp_cnt), - atomic_read(&lnk->rdma_write_cnt)); + lnk->peer_cr_watermark_low, lnk->flags); } } spin_unlock_bh(&smc_lgr_list.lock); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index bf353c68323d9670b4b6b96ae2b0664e5044be0e..4b548e1182689f03f5e4ed126f9ef972e6f04873 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -145,35 +145,93 @@ static void smc_rx_spd_release(struct splice_pipe_desc *spd, static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, struct smc_sock *smc) { + struct smc_link_group *lgr = smc->conn.lgr; + int offset = offset_in_page(src); + struct partial_page *partial; struct splice_pipe_desc spd; - struct partial_page partial; - struct smc_spd_priv *priv; - int bytes; + struct smc_spd_priv **priv; + struct page **pages; + int bytes, nr_pages; + int i; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? 
+ PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL); + if (!partial) + goto out_page; + priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL); if (!priv) - return -ENOMEM; - priv->len = len; - priv->smc = smc; - partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; - partial.len = len; - partial.private = (unsigned long)priv; - - spd.nr_pages_max = 1; - spd.nr_pages = 1; - spd.pages = &smc->conn.rmb_desc->pages; - spd.partial = &partial; + goto out_part; + for (i = 0; i < nr_pages; i++) { + priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL); + if (!priv[i]) + goto out_priv; + } + + if (lgr->is_smcd || + (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) { + /* smcd or smcr that uses physically contiguous RMBs */ + priv[0]->len = len; + priv[0]->smc = smc; + partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial[0].len = len; + partial[0].private = (unsigned long)priv[0]; + pages[0] = smc->conn.rmb_desc->pages; + } else { + int size, left = len; + void *buf = src; + /* smcr that uses virtually contiguous RMBs*/ + for (i = 0; i < nr_pages; i++) { + size = min_t(int, PAGE_SIZE - offset, left); + priv[i]->len = size; + priv[i]->smc = smc; + pages[i] = vmalloc_to_page(buf); + partial[i].offset = offset; + partial[i].len = size; + partial[i].private = (unsigned long)priv[i]; + buf += size / sizeof(*buf); + left -= size; + offset = 0; + } + } + spd.nr_pages_max = nr_pages; + spd.nr_pages = nr_pages; + spd.pages = pages; + spd.partial = partial; spd.ops = &smc_pipe_ops; spd.spd_release = smc_rx_spd_release; bytes = splice_to_pipe(pipe, &spd); if (bytes > 0) { sock_hold(&smc->sk); - get_page(smc->conn.rmb_desc->pages); + if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) { + for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++) + get_page(pages[i]); + } else { + get_page(smc->conn.rmb_desc->pages); + } atomic_add(bytes, 
&smc->conn.splice_pending); } + kfree(priv); + kfree(partial); + kfree(pages); return bytes; + +out_priv: + for (i = (i - 1); i >= 0; i--) + kfree(priv[i]); + kfree(priv); +out_part: + kfree(partial); +out_page: + kfree(pages); +out: + return -ENOMEM; } static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) @@ -355,12 +413,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } break; } + if (!timeo) + return -EAGAIN; if (signal_pending(current)) { read_done = sock_intr_errno(timeo); break; } - if (!timeo) - return -EAGAIN; } if (!smc_rx_data_available(conn)) { @@ -414,7 +472,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (rc < 0) { if (!read_done) read_done = -EFAULT; - smc_rmb_sync_sg_for_device(conn); goto out; } } @@ -428,7 +485,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, chunk_len_sum += chunk_len; chunk_off = 0; /* modulo offset in recv ring buffer */ } - smc_rmb_sync_sg_for_device(conn); /* update cursors */ if (!(flags & MSG_PEEK)) { diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 78f9895d649e3c85074f02be0d18bfa5adc4ea4d..09c585c69e7011a2b3e07b1b5a6738912f7bd056 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -15,12 +15,15 @@ #include #include "smc.h" +#include "smc_core.h" #include "smc_sysctl.h" #include "smc_core.h" static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; +static int two = 2; + static struct ctl_table smc_table[] = { { .procname = "autocorking_size", @@ -29,6 +32,15 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, + { + .procname = "smcr_buf_type", + .data = &init_net.smc.sysctl_smcr_buf_type, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, { .procname = "wmem_default", .data = &init_net.smc.sysctl_wmem_default, @@ -88,6 +100,33 @@ static struct ctl_table smc_table[] 
= { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "simplify_rkey_exhcange", + .data = &init_net.smc.sysctl_simplify_rkey_exhcange, + .maxlen = sizeof(init_net.smc.sysctl_simplify_rkey_exhcange), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "fastopen", + .data = &init_net.smc.sysctl_smc_fastopen, + .maxlen = sizeof(init_net.smc.sysctl_smc_fastopen), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sysctl_smc_experiments", + .data = &init_net.smc.sysctl_smc_experiments, + .maxlen = sizeof(init_net.smc.sysctl_smc_experiments), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; @@ -112,13 +151,18 @@ int __net_init smc_sysctl_net_init(struct net *net) goto err_reg; net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_wmem_default = 256 * 1024; net->smc.sysctl_rmem_default = 384 * 1024; net->smc.sysctl_tcp2smc = 0; net->smc.sysctl_allow_different_subnet = 1; net->smc.sysctl_keep_first_contact_clcsock = 1; net->smc.sysctl_disable_multiple_link = 1; - + /* default on */ + net->smc.sysctl_simplify_rkey_exhcange = 1; + net->smc.sysctl_smc_fastopen = 1; + /* default off */ + net->smc.sysctl_smc_experiments = 0; return 0; err_reg: diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 20217edfb9e3f811fb6edc3dcd11f453df4cf784..c7beaa1f38d9c23d092ca93dc370cdb477592564 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -246,7 +246,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; - smc_sndbuf_sync_sg_for_cpu(conn); for (chunk = 0; chunk < 2; chunk++) { rc = memcpy_from_msg(sndbuf_base + chunk_off, msg, chunk_len); @@ -367,8 +366,6 @@ static int 
smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); - else - atomic_inc(&link->rdma_write_cnt); return rc; } @@ -398,6 +395,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); + u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -405,13 +403,25 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { - struct ib_sge *sge = - wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list; + struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk]; + struct ib_sge *sge = wr->wr.sg_list; + u64 base_addr = dma_addr; + + if (dst_len < link->qp_attr.cap.max_inline_data) { + base_addr = virt_addr; + wr->wr.send_flags |= IB_SEND_INLINE; + } else { + wr->wr.send_flags &= ~IB_SEND_INLINE; + } num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sge[srcchunk].addr = dma_addr + src_off; + sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? 
+ (virt_addr + src_off) : (base_addr + src_off); sge[srcchunk].length = src_len; + if (conn->sndbuf_desc->is_vm) + sge[srcchunk].lkey = + conn->sndbuf_desc->mr[link->link_idx]->lkey; num_sges++; src_off += src_len; @@ -424,8 +434,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } - rc = smc_tx_rdma_write(conn, dst_off, num_sges, - &wr_rdma_buf->wr_tx_rdma[dstchunk]); + rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr); if (rc) return rc; if (dst_len_sum == len) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index c36b7c3e1b4c74b9122b06e2d2efd49ca2d9fe5f..08b310ff5db68553571516f0485ac0bae4e7d12c 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -81,17 +81,12 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) u32 pnd_snd_idx; link = wc->qp->qp_context; - atomic_inc(&link->total_comp_cnt); if (wc->opcode == IB_WC_REG_MR) { - if (wc->status) { + if (wc->status) link->wr_reg_state = FAILED; - pr_warn("smc: reg mr comp failed\n"); - atomic_inc(&link->bad_comp_cnt); - } else { + else link->wr_reg_state = CONFIRMED; - atomic_inc(&link->reg_comp_cnt); - } smc_wr_wakeup_reg_wait(link); return; } @@ -99,10 +94,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); if (pnd_snd_idx == link->wr_tx_cnt) { if (link->lgr->smc_version != SMC_V2 || - link->wr_tx_v2_pend->wr_id != wc->wr_id) { - pr_warn("smc: find pending index failed\n"); + link->wr_tx_v2_pend->wr_id != wc->wr_id) return; - } link->wr_tx_v2_pend->wc_status = wc->status; memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); /* clear the full struct smc_wr_tx_pend including .priv */ @@ -121,14 +114,11 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_pends[pnd_snd_idx])); memset(&link->wr_tx_bufs[pnd_snd_idx], 0, sizeof(link->wr_tx_bufs[pnd_snd_idx])); - if (!test_and_clear_bit(pnd_snd_idx, 
link->wr_tx_mask)) { - pr_warn("smc: clear pending index bitmap failed\n"); + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) return; - } } if (wc->status) { - atomic_inc(&link->bad_comp_cnt); if (link->lgr->smc_version == SMC_V2) { memset(link->wr_tx_v2_pend, 0, sizeof(*link->wr_tx_v2_pend)); @@ -144,42 +134,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) wake_up(&link->wr_tx_wait); } -static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) -{ - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); - struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i, rc; - -again: - do { - memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - for (i = 0; i < rc; i++) - smc_wr_tx_process_cqe(&wc[i]); - if (rc < SMC_WR_MAX_POLL_CQE) - /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been - * drained, no need to poll again. - */ - break; - } while (rc > 0); - - /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, - * then it is safe to wait for the next event; else we must poll the - * CQ again to make sure we won't miss any event. 
- */ - if (ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0) - goto again; -} - -void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) -{ - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - - tasklet_schedule(&smcibcq->tasklet); -} - /*---------------------------- request submission ---------------------------*/ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) @@ -330,8 +284,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) if (rc) { smc_wr_tx_put_slot(link, priv); smcr_link_down_cond_sched(link); - } else { - atomic_inc(&link->total_send_cnt); } return rc; } @@ -342,14 +294,10 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibcq_send->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); smcr_link_down_cond_sched(link); - } else { - atomic_inc(&link->total_send_cnt); } return rc; } @@ -388,8 +336,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibcq_send->ib_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; link->wr_reg.mr = mr; @@ -397,8 +343,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL); if (rc) return rc; - atomic_inc(&link->reg_send_cnt); - atomic_inc(&link->total_send_cnt); atomic_inc(&link->wr_reg_refcnt); rc = wait_event_interruptible_timeout(link->wr_reg_wait, @@ -470,56 +414,75 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) } } -static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) +static inline void smc_wr_rx_process_cqe(struct ib_wc *wc) { - struct smc_link *link; - int i; + struct 
smc_link *link = wc->qp->qp_context; - for (i = 0; i < num; i++) { - link = wc[i].qp->qp_context; - if (wc[i].status == IB_WC_SUCCESS) { - link->wr_rx_tstamp = jiffies; - smc_wr_rx_demultiplex(&wc[i]); + if (wc->status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; + smc_wr_rx_demultiplex(wc); + smc_wr_rx_post(link); /* refill WR RX */ + } else { + /* handle status errors */ + switch (wc->status) { + case IB_WC_RETRY_EXC_ERR: + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + smcr_link_down_cond_sched(link); + break; + default: smc_wr_rx_post(link); /* refill WR RX */ - } else { - /* handle status errors */ - switch (wc[i].status) { - case IB_WC_RETRY_EXC_ERR: - case IB_WC_RNR_RETRY_EXC_ERR: - case IB_WC_WR_FLUSH_ERR: - smcr_link_down_cond_sched(link); - break; - default: - smc_wr_rx_post(link); /* refill WR RX */ - break; - } + break; } + } - if (smc_wr_rx_credits_need_announce(link) && - !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { - set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); - schedule_work(&link->credits_announce_work); - } + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); } } -static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) +int smc_wr_rx_post_init(struct smc_link *link) +{ + u32 i; + int rc = 0; + + for (i = 0; i < link->wr_rx_cnt; i++) + rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); + return rc; +} + +static void smc_wr_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int rc; + int i, rc, completed = 0; again: do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); + for (i = 0; i < rc; i++) { + switch (wc[i].opcode) { + case IB_WC_REG_MR: + case 
IB_WC_SEND: + smc_wr_tx_process_cqe(&wc[i]); + break; + case IB_WC_RECV: + smc_wr_rx_process_cqe(&wc[i]); + break; + default: + pr_warn("smc: unexpected wc opcode %d, status %d, wr_id %llu.\n", + wc[i].opcode, wc[i].status, wc[i].wr_id); + break; + } + } + if (rc > 0) - smc_wr_rx_process_cqes(&wc[0], rc); - if (rc < SMC_WR_MAX_POLL_CQE) - /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been - * drained, no need to poll again. - */ - break; + completed += rc; } while (rc > 0); /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, @@ -530,27 +493,18 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; + + if (smcibcq->ib_cq->dim) + rdma_dim(smcibcq->ib_cq->dim, completed); } -void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context) { struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; tasklet_schedule(&smcibcq->tasklet); } -int smc_wr_rx_post_init(struct smc_link *link) -{ - u32 i; - int rc = 0; - - for (i = 0; i < link->wr_rx_cnt; i++) - rc = smc_wr_rx_post(link); - // credits have already been announced to peer - atomic_set(&link->local_rq_credits, 0); - return rc; -} - /***************************** init, exit, misc ******************************/ void smc_wr_remember_qp_attr(struct smc_link *lnk) @@ -589,10 +543,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; + bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; for (i = 0; i < lnk->wr_tx_cnt; i++) { - lnk->wr_tx_sges[i].addr = + lnk->wr_tx_sges[i].addr = send_inline ? 
(uintptr_t)(&lnk->wr_tx_bufs[i]) : lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; @@ -608,8 +563,9 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; lnk->wr_tx_ibs[i].num_sge = 1; lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; - lnk->wr_tx_ibs[i].send_flags = - IB_SEND_SIGNALED | IB_SEND_SOLICITED; + lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED; + if (send_inline) + lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list = @@ -627,8 +583,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; lnk->wr_tx_v2_ib->num_sge = 1; lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; - lnk->wr_tx_v2_ib->send_flags = - IB_SEND_SIGNALED | IB_SEND_SOLICITED; + lnk->wr_tx_v2_ib->send_flags = IB_SEND_SIGNALED; } /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. 
@@ -867,21 +822,17 @@ void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { int i; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); - tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); - } + for (i = 0; i < smcibdev->num_cq; i++) + tasklet_kill(&smcibdev->smcibcq[i].tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { int i; - for (i = 0; i < smcibdev->num_cq_peer; i++) { - tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, - smc_wr_tx_tasklet_fn); - tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, - smc_wr_rx_tasklet_fn); + for (i = 0; i < smcibdev->num_cq; i++) { + tasklet_setup(&smcibdev->smcibcq[i].tasklet, + smc_wr_tasklet_fn); } } @@ -936,6 +887,11 @@ int smc_wr_create_link(struct smc_link *lnk) lnk->flags = 0; lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); lnk->peer_cr_watermark_low = 0; + + /* if accumulated credits are less than 10% of wr_rx_cnt (at least 5), + * they will not be announced by cdc msg. + */ + lnk->credits_update_limit = max(lnk->wr_rx_cnt / 10, 5U); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 8cf276215c91e80f472c3cbd01c397d3806f6343..ce338e1ca6c24e1b26fc6b9152992a1fd23fc266 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -133,6 +133,17 @@ static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; } +static inline int smc_wr_rx_credits_need_announce_frequent(struct smc_link *link) +{ + /* announce when local rq credits accumulated more than credits_update_limit, or + * peer rq credits are empty. If peer credits are empty while local credits are below + * credits_update_limit, a credits deadlock may result. 
+ */ + return link->credits_enable && + (atomic_read(&link->local_rq_credits) >= link->credits_update_limit || + !atomic_read(&link->peer_rq_credits)); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -176,12 +187,11 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int len); int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); -void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); -void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr); #endif /* SMC_WR_H */