From 21e33596813bb4d22968aeb80b4f450d98d92151 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 14 Jun 2023 17:10:24 +0800 Subject: [PATCH 01/21] anolis: net/smc: Revert all Anolis patches for upstream backport ANBZ: #5534 This huge patch reverts all ANCK net/smc patches to backport upstream net/smc patches and prepare for ANCK 5.10-015. Except for * a97c2e0 anolis: net/smc: Supplement for SMC-R iWARP support * 5cdf4d5 anolis: net/smc: Introduce iWARP device support * 24dd34f anolis: net/smc: Introduce iWARP extended information in struct smc_link which are needed by eRDMA. Signed-off-by: Wen Gu --- Documentation/networking/smc-sysctl.rst | 10 +- include/linux/btf_ids.h | 11 - include/net/net_namespace.h | 1 - include/net/netns/smc.h | 9 +- include/net/smc.h | 277 +------------- include/rdma/ib_verbs.h | 3 +- include/uapi/linux/in.h | 3 - include/uapi/linux/in6.h | 2 - include/uapi/linux/smc.h | 7 - include/uapi/rdma/ib_user_verbs.h | 1 - kernel/bpf/bpf_struct_ops_types.h | 4 - net/Makefile | 5 - net/smc/Makefile | 2 +- net/smc/af_smc.c | 181 ++-------- net/smc/bpf_smc_struct_ops.c | 152 -------- net/smc/smc.h | 229 +++++++++++- net/smc/smc_cdc.c | 9 +- net/smc/smc_clc.c | 6 - net/smc/smc_core.c | 153 +------- net/smc/smc_core.h | 25 +- net/smc/smc_diag.c | 29 +- net/smc/smc_dim.c | 250 ------------- net/smc/smc_dim.h | 36 -- net/smc/smc_ib.c | 274 ++------------ net/smc/smc_ib.h | 16 +- net/smc/smc_llc.c | 16 +- net/smc/smc_proc.c | 341 ------------------ net/smc/smc_proc.h | 34 -- net/smc/smc_stats.c | 46 --- net/smc/smc_stats.h | 47 --- net/smc/smc_sysctl.c | 31 +- net/smc/smc_tx.c | 6 - net/smc/smc_wr.c | 242 +++++++------ net/smc/smc_wr.h | 19 +- net/socket.c | 8 - .../selftests/bpf/prog_tests/bpf_smc.c | 37 -- tools/testing/selftests/bpf/progs/bpf_smc.c | 320 ---------------- 37 files changed, 496 insertions(+), 2346 deletions(-) delete mode 100644 net/smc/bpf_smc_struct_ops.c delete mode 100644 net/smc/smc_dim.c delete mode 100644 net/smc/smc_dim.h delete 
mode 100644 net/smc/smc_proc.c delete mode 100644 net/smc/smc_proc.h delete mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_smc.c delete mode 100644 tools/testing/selftests/bpf/progs/bpf_smc.c diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index a97284ad3c37..2c4b5c2181f7 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -44,16 +44,18 @@ smcr_testlink_time - INTEGER wmem - INTEGER Initial size of send buffer used by SMC sockets. + The default value inherits from net.ipv4.tcp_wmem[1]. - The minimum value is 256KiB and there is no hard limit for max value, but + The minimum value is 16KiB and there is no hard limit for max value, but only allowed 512KiB for SMC-R and 1MiB for SMC-D. - Default: 256K + Default: 16K rmem - INTEGER Initial size of receive buffer (RMB) used by SMC sockets. + The default value inherits from net.ipv4.tcp_rmem[1]. - The minimum value is 256KiB and there is no hard limit for max value, but + The minimum value is 16KiB and there is no hard limit for max value, but only allowed 512KiB for SMC-R and 1MiB for SMC-D. 
- Default: 256K + Default: 128K diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 632c31b67666..57890b357f85 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -184,15 +184,4 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif -#if IS_ENABLED(CONFIG_SMC) -enum { -#define BTF_SMC_TYPE(name, type) name, -BTF_SMC_TYPE(BTF_SMC_TYPE_SOCK, smc_sock) -BTF_SMC_TYPE(BTF_SMC_TYPE_CONNECTION, smc_connection) -#undef BTF_SMC_TYPE -MAX_BTF_SMC_TYPE -}; -extern u32 btf_smc_ids[]; -#endif - #endif diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index aef9f5b6b42e..d3b94dff556b 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -96,7 +96,6 @@ struct net { struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; - struct proc_dir_entry *proc_net_smc; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index c19dee9b997c..582212ada3ba 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -4,8 +4,6 @@ #include #include -#define SMC_IWARP_RSVD_PORTS_NUM 16 /* must be 16 */ - struct smc_stats_rsn; struct smc_stats; struct netns_smc { @@ -14,9 +12,8 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; - int limit_smc_hs; /* constraint on handshake */ - atomic_t iwarp_cnt; - struct socket *rsvd_sock[SMC_IWARP_RSVD_PORTS_NUM]; + + bool limit_smc_hs; /* constraint on handshake */ #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif @@ -25,7 +22,5 @@ struct netns_smc { int sysctl_smcr_testlink_time; int sysctl_wmem; int sysctl_rmem; - int sysctl_tcp2smc; - int sysctl_allow_different_subnet; }; #endif diff --git a/include/net/smc.h b/include/net/smc.h index 08e635552105..421a7197b475 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -11,21 +11,11 @@ #ifndef _SMC_H #define _SMC_H -#include -#include - -#ifdef 
ATOMIC64_INIT -#define KERNEL_HAS_ATOMIC64 -#endif - #define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ -#define SMC_HTABLE_SHIFT 9 -#define SMC_HTABLE_SIZE (1 << SMC_HTABLE_SHIFT) /* Size of SMC hashtable buckets */ struct smc_hashinfo { - unsigned int bkt_idx; rwlock_t lock; - struct hlist_head ht[SMC_HTABLE_SIZE]; + struct hlist_head ht; }; int smc_hash_sk(struct sock *sk); @@ -105,269 +95,4 @@ void smcd_unregister_dev(struct smcd_dev *smcd); void smcd_free_dev(struct smcd_dev *smcd); void smcd_handle_event(struct smcd_dev *dev, struct smcd_event *event); void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit, u16 dmbemask); - -struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ - union { - u8 type; -#if defined(__BIG_ENDIAN_BITFIELD) - struct { - u8 llc_version:4, - llc_type:4; - }; -#elif defined(__LITTLE_ENDIAN_BITFIELD) - struct { - u8 llc_type:4, - llc_version:4; - }; -#endif - }; -} __aligned(1); - -struct smc_cdc_conn_state_flags { -#if defined(__BIG_ENDIAN_BITFIELD) - u8 peer_done_writing : 1; /* Sending done indicator */ - u8 peer_conn_closed : 1; /* Peer connection closed indicator */ - u8 peer_conn_abort : 1; /* Abnormal close indicator */ - u8 reserved : 5; -#elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 reserved : 5; - u8 peer_conn_abort : 1; - u8 peer_conn_closed : 1; - u8 peer_done_writing : 1; -#endif -}; - -struct smc_cdc_producer_flags { -#if defined(__BIG_ENDIAN_BITFIELD) - u8 write_blocked : 1; /* Writing Blocked, no rx buf space */ - u8 urg_data_pending : 1; /* Urgent Data Pending */ - u8 urg_data_present : 1; /* Urgent Data Present */ - u8 cons_curs_upd_req : 1; /* cursor update requested */ - u8 failover_validation : 1;/* message replay due to failover */ - u8 reserved : 3; -#elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 reserved : 3; - u8 failover_validation : 1; - u8 cons_curs_upd_req : 1; - u8 urg_data_present : 1; - u8 urg_data_pending : 1; - u8 write_blocked : 1; -#endif -}; - -/* in host byte order 
*/ -union smc_host_cursor { /* SMC cursor - an offset in an RMBE */ - struct { - u16 reserved; - u16 wrap; /* window wrap sequence number */ - u32 count; /* cursor (= offset) part */ - }; -#ifdef KERNEL_HAS_ATOMIC64 - atomic64_t acurs; /* for atomic processing */ -#else - u64 acurs; /* for atomic processing */ -#endif -} __aligned(8); - -/* in host byte order, except for flag bitfields in network byte order */ -struct smc_host_cdc_msg { /* Connection Data Control message */ - struct smc_wr_rx_hdr common; /* .type = 0xFE */ - u8 len; /* length = 44 */ - u16 seqno; /* connection seq # */ - u32 token; /* alert_token */ - union smc_host_cursor prod; /* producer cursor */ - union smc_host_cursor cons; /* consumer cursor, - * piggy backed "ack" - */ - struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */ - struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/ - u8 reserved[18]; -} __aligned(8); - -enum smc_urg_state { - SMC_URG_VALID = 1, /* data present */ - SMC_URG_NOTYET = 2, /* data pending */ - SMC_URG_READ = 3, /* data was already read */ -}; - -struct smc_connection { - struct rb_node alert_node; - struct smc_link_group *lgr; /* link group of connection */ - struct smc_link *lnk; /* assigned SMC-R link */ - u32 alert_token_local; /* unique conn. id */ - u8 peer_rmbe_idx; /* from tcp handshake */ - int peer_rmbe_size; /* size of peer rx buffer */ - atomic_t peer_rmbe_space;/* remaining free bytes in peer - * rmbe - */ - int rtoken_idx; /* idx to peer RMB rkey/addr */ - - struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ - struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size_short;/* compressed notation */ - int rmbe_update_limit; - /* lower limit for consumer - * cursor update - */ - - struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging - * buffer for CDC msg send - * .prod cf. TCP snd_nxt - * .cons cf. 
TCP sends ack - */ - union smc_host_cursor local_tx_ctrl_fin; - /* prod crsr - confirmed by peer - */ - union smc_host_cursor tx_curs_prep; /* tx - prepared data - * snd_max..wmem_alloc - */ - union smc_host_cursor tx_curs_sent; /* tx - sent data - * snd_nxt ? - */ - union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer - * snd-wnd-begin ? - */ - atomic_t sndbuf_space; /* remaining space in sndbuf */ - u16 tx_cdc_seq; /* sequence # for CDC send */ - u16 tx_cdc_seq_fin; /* sequence # - tx completed */ - spinlock_t send_lock; /* protect wr_sends */ - atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe - * - inc when post wqe, - * - dec on polled tx cqe - */ - wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ - atomic_t tx_pushing; /* nr_threads trying tx push */ - struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ - u32 tx_off; /* base offset in peer rmb */ - - struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. - * .prod cf. TCP rcv_nxt - * .cons cf. TCP snd_una - */ - union smc_host_cursor rx_curs_confirmed; /* confirmed to peer - * source of snd_una ? 
- */ - union smc_host_cursor urg_curs; /* points at urgent byte */ - enum smc_urg_state urg_state; - bool urg_tx_pend; /* urgent data staged */ - bool urg_rx_skip_pend; - /* indicate urgent oob data - * read, but previous regular - * data still pending - */ - char urg_rx_byte; /* urgent byte */ - bool tx_in_release_sock; - /* flush pending tx data in - * sock release_cb() - */ - atomic_t bytes_to_rcv; /* arrived data, - * not yet received - */ - atomic_t splice_pending; /* number of spliced bytes - * pending processing - */ -#ifndef KERNEL_HAS_ATOMIC64 - spinlock_t acurs_lock; /* protect cursors */ -#endif - struct work_struct close_work; /* peer sent some closing */ - struct work_struct abort_work; /* abort the connection */ - struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ - u8 rx_off; /* receive offset: - * 0 for SMC-R, 32 for SMC-D - */ - u64 peer_token; /* SMC-D token of peer */ - u8 killed : 1; /* abnormal termination */ - u8 freed : 1; /* normal termiation */ - u8 out_of_sync : 1; /* out of sync with peer */ -}; - -struct smc_sock { /* smc sock container */ - struct sock sk; - struct socket *clcsock; /* internal tcp socket */ - void (*clcsk_state_change)(struct sock *sk); - /* original stat_change fct. */ - void (*clcsk_data_ready)(struct sock *sk); - /* original data_ready fct. */ - void (*clcsk_write_space)(struct sock *sk); - /* original write_space fct. */ - void (*clcsk_error_report)(struct sock *sk); - /* original error_report fct. 
*/ - struct smc_connection conn; /* smc connection */ - struct smc_sock *listen_smc; /* listen parent */ - struct work_struct connect_work; /* handle non-blocking connect*/ - struct work_struct tcp_listen_work;/* handle tcp socket accepts */ - struct work_struct smc_listen_work;/* prepare new accept socket */ - struct list_head accept_q; /* sockets to be accepted */ - spinlock_t accept_q_lock; /* protects accept_q */ - bool limit_smc_hs; /* put constraint on handshake */ - bool use_fallback; /* fallback to tcp */ - int fallback_rsn; /* reason for fallback */ - u32 peer_diagnosis; /* decline reason from peer */ - atomic_t queued_smc_hs; /* queued smc handshakes */ - struct inet_connection_sock_af_ops af_ops; - const struct inet_connection_sock_af_ops *ori_af_ops; - /* original af ops */ - int sockopt_defer_accept; - /* sockopt TCP_DEFER_ACCEPT - * value - */ - u8 wait_close_tx_prepared : 1; - /* shutdown wr or close - * started, waiting for unsent - * data to be sent - */ - u8 smc_negotiated : 1; - /* whether the smc_sock - * was successfully negotiated - * via TCP options. - */ - u8 connect_nonblock : 1; - /* non-blocking connect in - * flight - */ - struct mutex clcsock_release_lock; - /* protects clcsock of a listen - * socket - */ -}; - -#define SMC_SOCK_CLOSED_TIMING (0) - -#ifdef CONFIG_BPF_SYSCALL - -/* BPF struct ops for smc protocol negotiator */ -struct smc_sock_negotiator_ops { - /* ret for negotiate */ - int (*negotiate)(struct smc_sock *sk); - - /* info gathering timing */ - void (*collect_info)(struct sock *sk, int timing); -}; - -/* Query if current sock should go with SMC protocol - * SK_PASS for yes, otherwise for no. - */ -int smc_sock_should_select_smc(const struct smc_sock *smc); - - -/* At some specific points in time, - * let negotiator can perform info gathering - * on target sock. 
- */ -void smc_sock_perform_collecting_info(const struct sock *sk, int timing); - -#else - -static inline int smc_sock_should_select_smc(const struct smc_sock *smc) -{ - return SK_PASS; -} - -static inline void smc_sock_perform_collecting_info(const struct sock *sk, int timing) -{ - -} - -#endif /* CONFIG_BPF_SYSCALL */ - #endif /* _SMC_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 489f91abeadd..a155f6d28ce2 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1134,10 +1134,9 @@ enum ib_qp_create_flags { IB_QP_CREATE_PCI_WRITE_END_PADDING = IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, + IB_QP_CREATE_IWARP_WITHOUT_CM = 1 << 25, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, - /* reserve for eRDMA OOB connection establishment */ - IB_QP_CREATE_IWARP_WITHOUT_CM = 1 << 27, IB_QP_CREATE_RESERVED_END = 1 << 31, }; diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 40b1e51b18c9..d1b327036ae4 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -84,9 +84,6 @@ enum { }; #endif -/* SMC protocol, IPv4 */ -#define SMCPROTO_SMC 0 - #if __UAPI_DEF_IN_ADDR /* Internet address. 
*/ struct in_addr { diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 6c21c85be0e3..5ad396a57eb3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -95,8 +95,6 @@ struct in6_flowlabel_req { #define IPV6_FL_S_USER 3 #define IPV6_FL_S_ANY 255 -/* SMC protocol, IPv6 */ -#define SMCPROTO_SMC6 1 /* * Bitmask constant declarations to help applications select out the diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index d9b5bd6cef85..bb4dacca31e7 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -142,13 +142,6 @@ enum { SMC_NLA_LINK_UID, /* u32 */ SMC_NLA_LINK_PEER_UID, /* u32 */ SMC_NLA_LINK_STATE, /* u32 */ - SMC_NLA_LINK_QPN, /* u32 */ - SMC_NLA_LINK_PEER_QPN, /* u32 */ - SMC_NLA_LINK_SWR_CNT, /* u64 */ - SMC_NLA_LINK_SWC_CNT, /* u64 */ - SMC_NLA_LINK_RWR_CNT, /* u64 */ - SMC_NLA_LINK_RWC_CNT, /* u64 */ - SMC_NLA_LINK_WWC_CNT, /* u64 */ __SMC_NLA_LINK_MAX, SMC_NLA_LINK_MAX = __SMC_NLA_LINK_MAX - 1 }; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index d4a18e5d7bcb..456438c18c2c 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -414,7 +414,6 @@ struct ib_uverbs_create_cq { enum ib_uverbs_ex_create_cq_flags { IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, - IB_UVERBS_CQ_FLAGS_LOCK_FREE = 1 << 6, }; struct ib_uverbs_ex_create_cq { diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h index a4174169454f..066d83ea1c99 100644 --- a/kernel/bpf/bpf_struct_ops_types.h +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -6,8 +6,4 @@ #include BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) #endif -#if IS_ENABLED(CONFIG_SMC) -#include -BPF_STRUCT_OPS_TYPE(smc_sock_negotiator_ops) -#endif #endif diff --git a/net/Makefile b/net/Makefile index d2f39c906b19..fc765b6ba067 100644 --- a/net/Makefile +++ b/net/Makefile @@ -53,11 +53,6 @@ obj-$(CONFIG_TIPC) += tipc/ 
obj-$(CONFIG_NETLABEL) += netlabel/ obj-$(CONFIG_IUCV) += iucv/ obj-$(CONFIG_SMC) += smc/ -ifneq ($(CONFIG_SMC),) -ifeq ($(CONFIG_BPF_SYSCALL),y) -obj-y += smc/bpf_smc_struct_ops.o -endif -endif obj-$(CONFIG_RFKILL) += rfkill/ obj-$(CONFIG_NET_9P) += 9p/ obj-$(CONFIG_CAIF) += caif/ diff --git a/net/smc/Makefile b/net/smc/Makefile index 59a4f49f186a..875efcd126a2 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_proc.o smc_dim.o +smc-y += smc_tracepoint.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index aec9075684e3..130070425568 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -52,7 +52,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_proc.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -68,16 +67,6 @@ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); -/* default use reserve_mode */ -bool reserve_mode = true; -module_param(reserve_mode, bool, 0444); -MODULE_PARM_DESC(reserve_mode, "reserve mode support and keep-first-contact disable"); - -/* rsvd_ports_base must less than (u16 MAX - 8) */ -u16 rsvd_ports_base = SMC_IWARP_RSVD_PORTS_BASE; -module_param(rsvd_ports_base, ushort, 0444); -MODULE_PARM_DESC(rsvd_ports_base, "base of rsvd ports for reserve_mode"); - int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); @@ -135,8 +124,6 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, struct sock *child; smc = smc_clcsock_user_data(sk); - if 
(unlikely(!smc)) - goto drop; if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > sk->sk_max_ack_backlog) @@ -178,9 +165,6 @@ static bool smc_hs_congested(const struct sock *sk) if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) return true; - if (!smc_sock_should_select_smc(smc)) - return true; - return false; } @@ -197,13 +181,11 @@ int smc_hash_sk(struct sock *sk) struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; - write_lock_bh(&h->lock); - - head = &h->ht[h->bkt_idx++ & (SMC_HTABLE_SIZE - 1)]; + head = &h->ht; + write_lock_bh(&h->lock); sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - write_unlock_bh(&h->lock); return 0; @@ -265,9 +247,6 @@ static void smc_fback_restore_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; - if (!clcsk) - return; - write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = NULL; @@ -340,16 +319,13 @@ static int smc_release(struct socket *sock) sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); - /* trigger info gathering if needed.*/ - smc_sock_perform_collecting_info(sk, SMC_SOCK_CLOSED_TIMING); - old_state = sk->sk_state; /* cleanup for a dangling non-blocking connect */ if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); - if (smc->connect_nonblock && cancel_work_sync(&smc->connect_work)) + if (cancel_work_sync(&smc->connect_work)) sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) @@ -360,8 +336,8 @@ static int smc_release(struct socket *sock) else lock_sock(sk); - if ((old_state == SMC_INIT || smc->conn.killed) && - sk->sk_state == SMC_ACTIVE && !smc->use_fallback) + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) smc_close_active_abort(smc); rc = __smc_release(smc); @@ -733,11 +709,7 @@ static void smc_link_save_peer_info(struct smc_link *link, struct smc_clc_msg_accept_confirm *clc, struct 
smc_init_info *ini) { - struct smc_link_stats *lnk_stats = - &link->lgr->lnk_stats[link->link_idx]; - link->peer_qpn = ntoh24(clc->r0.qpn); - lnk_stats->peer_qpn = link->peer_qpn; memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); @@ -1654,20 +1626,8 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, break; } - if (!smc->clcsock || - (smc->clcsock && !smc->clcsock->sk)) { - rc = -EBADF; - goto out; - } smc_copy_sock_settings_to_clc(smc); - /* accept out connection as SMC connection */ - if (smc_sock_should_select_smc(smc) == SK_PASS) { - tcp_sk(smc->clcsock->sk)->syn_smc = 1; - } else { - tcp_sk(smc->clcsock->sk)->syn_smc = 0; - smc_switch_to_fallback(smc, /* just a chooice */ 0); - } - + tcp_sk(smc->clcsock->sk)->syn_smc = 1; if (smc->connect_nonblock) { rc = -EALREADY; goto out; @@ -1729,12 +1689,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); - mutex_lock(&lsmc->clcsock_release_lock); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); - mutex_unlock(&lsmc->clcsock_release_lock); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; @@ -1772,8 +1730,8 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); - sk_acceptq_added(parent); spin_unlock(&par->accept_q_lock); + sk_acceptq_added(parent); } /* remove a socket from the accept queue of its parental listening socket */ @@ -1783,16 +1741,11 @@ static void smc_accept_unlink(struct sock *sk) spin_lock(&par->accept_q_lock); list_del_init(&smc_sk(sk)->accept_q); - sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); spin_unlock(&par->accept_q_lock); + 
sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); sock_put(sk); /* sock_hold in smc_accept_enqueue */ } -static inline bool smc_accept_queue_empty(struct sock *sk) -{ - return list_empty(&smc_sk(sk)->accept_q); -} - /* remove a sock from the accept queue to bind it to a new socket created * for a socket accept call from user space */ @@ -1900,7 +1853,7 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - if (new_smc->smc_negotiated) + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); if (lsmc->sk.sk_state == SMC_LISTEN) { @@ -2307,7 +2260,6 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - struct net *net = sock_net(&new_smc->sk); int prfx_rc; /* check for ISM device matching V2 proposed device */ @@ -2315,12 +2267,10 @@ static int smc_listen_find_device(struct smc_sock *new_smc, if (ini->ism_dev[0]) return 0; - if (!net->smc.sysctl_allow_different_subnet) { - /* check for matching IP prefix and subnet length (V1) */ - prfx_rc = smc_listen_prfx_check(new_smc, pclc); - if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); - } + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -2487,8 +2437,7 @@ static void smc_listen_work(struct work_struct *work) } smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); - if (newclcsock->sk) - SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; out_unlock: @@ -2517,12 +2466,8 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; - if (tcp_sk(new_smc->clcsock->sk)->syn_smc) { - new_smc->smc_negotiated = 1; + if 
(tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_inc(&lsmc->queued_smc_hs); - /* memory barrier */ - smp_mb__after_atomic(); - } new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; @@ -2781,12 +2726,17 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, return rc; } -static inline __poll_t smc_accept_poll(struct sock *parent) +static __poll_t smc_accept_poll(struct sock *parent) { - if (!smc_accept_queue_empty(parent)) - return EPOLLIN | EPOLLRDNORM; + struct smc_sock *isk = smc_sk(parent); + __poll_t mask = 0; - return 0; + spin_lock(&isk->accept_q_lock); + if (!list_empty(&isk->accept_q)) + mask = EPOLLIN | EPOLLRDNORM; + spin_unlock(&isk->accept_q_lock); + + return mask; } static __poll_t smc_poll(struct file *file, struct socket *sock, @@ -3396,71 +3346,20 @@ static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { .clone = smc_ulp_clone, }; -static int smc_net_reserve_ports(struct net *net) -{ - struct smc_ib_device *smcibdev; - struct ib_device *ibdev; - int rc = 0; - - if (!reserve_mode) - return 0; - atomic_set(&net->smc.iwarp_cnt, 0); - memset(net->smc.rsvd_sock, 0, sizeof(net->smc.rsvd_sock)); - - mutex_lock(&smc_ib_devices.mutex); - list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { - ibdev = smcibdev->ibdev; - if (!smc_ib_is_iwarp(ibdev, 1)) - continue; - if (!rdma_dev_access_netns(ibdev, net)) - continue; - if (atomic_inc_return(&net->smc.iwarp_cnt) > 1) - continue; - /* first iwarp device */ - rc = smcr_iw_net_reserve_ports(net); - if (rc) { - atomic_set(&net->smc.iwarp_cnt, 0); - break; - } - } - mutex_unlock(&smc_ib_devices.mutex); - return rc; -} - -static void smc_net_release_ports(struct net *net) -{ - if (!reserve_mode) - return; - if (atomic_read(&net->smc.iwarp_cnt) && - net->smc.rsvd_sock[0]) - smcr_iw_net_release_ports(net); -} - unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { int rc; - rc = smc_net_reserve_ports(net); - if (rc) - return rc; rc = 
smc_sysctl_net_init(net); if (rc) - goto release_ports; - rc = smc_pnet_net_init(net); - if (rc) - goto release_ports; - return 0; - -release_ports: - smc_net_release_ports(net); - return rc; + return rc; + return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { - smc_net_release_ports(net); smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } @@ -3489,17 +3388,7 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { - int rc, i; - - if (reserve_mode) { - pr_info_ratelimited("smc: load SMC module with reserve_mode\n"); - if (rsvd_ports_base > - (U16_MAX - SMC_IWARP_RSVD_PORTS_NUM)) { - pr_info_ratelimited("smc: reserve_mode with invalid " - "ports base\n"); - return -EINVAL; - } - } + int rc; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -3569,11 +3458,8 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - - for (i = 0; i < SMC_HTABLE_SIZE; i++) { - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht[i]); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); - } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { @@ -3587,17 +3473,9 @@ static int __init smc_init(void) goto out_ib; } - rc = smc_proc_init(); - if (rc) { - pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); - goto out_ulp; - } - static_branch_enable(&tcp_have_smc); return 0; -out_ulp: - tcp_unregister_ulp(&smc_ulp_ops); out_ib: smc_ib_unregister_client(); out_sock: @@ -3630,7 +3508,6 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); - smc_proc_exit(); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); diff --git a/net/smc/bpf_smc_struct_ops.c b/net/smc/bpf_smc_struct_ops.c deleted file mode 100644 index 15fd1b506a16..000000000000 --- a/net/smc/bpf_smc_struct_ops.c +++ /dev/null @@ -1,152 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - 
-#include -#include -#include -#include -#include -#include -#include - -extern struct bpf_struct_ops smc_sock_negotiator_ops; - -DEFINE_RWLOCK(smc_sock_negotiator_ops_rwlock); -struct smc_sock_negotiator_ops *negotiator; - -/* convert sk to smc_sock */ -static inline struct smc_sock *smc_sk(const struct sock *sk) -{ - return (struct smc_sock *)sk; -} - -/* register ops */ -static inline void smc_reg_passive_sk_ops(struct smc_sock_negotiator_ops *ops) -{ - write_lock_bh(&smc_sock_negotiator_ops_rwlock); - negotiator = ops; - write_unlock_bh(&smc_sock_negotiator_ops_rwlock); -} - -/* unregister ops */ -static inline void smc_unreg_passive_sk_ops(struct smc_sock_negotiator_ops *ops) -{ - write_lock_bh(&smc_sock_negotiator_ops_rwlock); - if (negotiator == ops) - negotiator = NULL; - write_unlock_bh(&smc_sock_negotiator_ops_rwlock); -} - -int smc_sock_should_select_smc(const struct smc_sock *smc) -{ - int ret = SK_PASS; - - read_lock_bh(&smc_sock_negotiator_ops_rwlock); - if (negotiator && negotiator->negotiate) - ret = negotiator->negotiate((struct smc_sock *)smc); - read_unlock_bh(&smc_sock_negotiator_ops_rwlock); - return ret; -} -EXPORT_SYMBOL_GPL(smc_sock_should_select_smc); - -void smc_sock_perform_collecting_info(const struct sock *sk, int timing) -{ - read_lock_bh(&smc_sock_negotiator_ops_rwlock); - if (negotiator && negotiator->collect_info) - negotiator->collect_info((struct sock *)sk, timing); - read_unlock_bh(&smc_sock_negotiator_ops_rwlock); -} -EXPORT_SYMBOL_GPL(smc_sock_perform_collecting_info); - -/* define global smc ID for smc_struct_ops */ -BTF_ID_LIST_GLOBAL(btf_smc_ids) -#define BTF_SMC_TYPE(name, type) BTF_ID(struct, type) -BTF_SMC_TYPE(BTF_SMC_TYPE_SOCK, smc_sock) -BTF_SMC_TYPE(BTF_SMC_TYPE_CONNECTION, smc_connection) -#undef BTF_SMC_TYPE - -static int bpf_smc_passive_sk_init(struct btf *btf) -{ - return 0; -} - -/* register ops by BPF */ -static int bpf_smc_passive_sk_ops_reg(void *kdata) -{ - struct smc_sock_negotiator_ops *ops = kdata; - - /* 
at least one ops need implement */ - if (!ops->negotiate || !ops->collect_info) { - pr_err("At least one ops need implement.\n"); - return -EINVAL; - } - - smc_reg_passive_sk_ops(ops); - /* always success now */ - return 0; -} - -/* unregister ops by BPF */ -static void bpf_smc_passive_sk_ops_unreg(void *kdata) -{ - smc_unreg_passive_sk_ops(kdata); -} - -static int bpf_smc_passive_sk_ops_check_member(const struct btf_type *t, - const struct btf_member *member) -{ - return 0; -} - -static int bpf_smc_passive_sk_ops_init_member(const struct btf_type *t, - const struct btf_member *member, - void *kdata, const void *udata) -{ - return 0; -} - -static const struct bpf_func_proto * -smc_passive_sk_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return bpf_base_func_proto(func_id); -} - -static bool smc_passive_sk_ops_prog_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) - return false; - if (type != BPF_READ) - return false; - if (off % size != 0) - return false; - - return btf_ctx_access(off, size, type, prog, info); -} - -static int smc_passive_sk_ops_prog_struct_access(struct bpf_verifier_log *log, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id) -{ - if (atype == BPF_READ) - return btf_struct_access(log, t, off, size, atype, next_btf_id); - - return -EACCES; -} - -static const struct bpf_verifier_ops bpf_smc_passive_sk_verifier_ops = { - .get_func_proto = smc_passive_sk_prog_func_proto, - .is_valid_access = smc_passive_sk_ops_prog_is_valid_access, - .btf_struct_access = smc_passive_sk_ops_prog_struct_access -}; - -struct bpf_struct_ops bpf_smc_sock_negotiator_ops = { - .verifier_ops = &bpf_smc_passive_sk_verifier_ops, - .init = bpf_smc_passive_sk_init, - .check_member = bpf_smc_passive_sk_ops_check_member, - .init_member = 
bpf_smc_passive_sk_ops_init_member, - .reg = bpf_smc_passive_sk_ops_reg, - .unreg = bpf_smc_passive_sk_ops_unreg, - .name = "smc_sock_negotiator_ops", -}; diff --git a/net/smc/smc.h b/net/smc/smc.h index 1be0ce2c9b68..5ed765ea0c73 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -22,6 +22,10 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 + +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ + #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ @@ -30,8 +34,9 @@ extern struct proto smc_proto; extern struct proto smc_proto6; -extern bool reserve_mode; -extern u16 rsvd_ports_base; +#ifdef ATOMIC64_INIT +#define KERNEL_HAS_ATOMIC64 +#endif enum smc_state { /* possible states of an SMC socket */ SMC_ACTIVE = 1, @@ -52,12 +57,232 @@ enum smc_state { /* possible states of an SMC socket */ struct smc_link_group; +struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ + union { + u8 type; +#if defined(__BIG_ENDIAN_BITFIELD) + struct { + u8 llc_version:4, + llc_type:4; + }; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + struct { + u8 llc_type:4, + llc_version:4; + }; +#endif + }; +} __aligned(1); + +struct smc_cdc_conn_state_flags { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 peer_done_writing : 1; /* Sending done indicator */ + u8 peer_conn_closed : 1; /* Peer connection closed indicator */ + u8 peer_conn_abort : 1; /* Abnormal close indicator */ + u8 reserved : 5; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 5; + u8 peer_conn_abort : 1; + u8 peer_conn_closed : 1; + u8 peer_done_writing : 1; +#endif +}; + +struct smc_cdc_producer_flags { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 write_blocked : 1; /* Writing Blocked, no rx buf space */ + u8 urg_data_pending : 1; /* Urgent Data Pending */ + u8 urg_data_present : 1; /* Urgent Data Present */ + u8 cons_curs_upd_req : 1; /* cursor update requested */ + u8 failover_validation : 
1;/* message replay due to failover */ + u8 reserved : 3; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 3; + u8 failover_validation : 1; + u8 cons_curs_upd_req : 1; + u8 urg_data_present : 1; + u8 urg_data_pending : 1; + u8 write_blocked : 1; +#endif +}; + +/* in host byte order */ +union smc_host_cursor { /* SMC cursor - an offset in an RMBE */ + struct { + u16 reserved; + u16 wrap; /* window wrap sequence number */ + u32 count; /* cursor (= offset) part */ + }; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); + +/* in host byte order, except for flag bitfields in network byte order */ +struct smc_host_cdc_msg { /* Connection Data Control message */ + struct smc_wr_rx_hdr common; /* .type = 0xFE */ + u8 len; /* length = 44 */ + u16 seqno; /* connection seq # */ + u32 token; /* alert_token */ + union smc_host_cursor prod; /* producer cursor */ + union smc_host_cursor cons; /* consumer cursor, + * piggy backed "ack" + */ + struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */ + struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/ + u8 reserved[18]; +} __aligned(8); + +enum smc_urg_state { + SMC_URG_VALID = 1, /* data present */ + SMC_URG_NOTYET = 2, /* data pending */ + SMC_URG_READ = 3, /* data was already read */ +}; + struct smc_mark_woken { bool woken; void *key; wait_queue_entry_t wait_entry; }; +struct smc_connection { + struct rb_node alert_node; + struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ + u32 alert_token_local; /* unique conn. 
id */ + u8 peer_rmbe_idx; /* from tcp handshake */ + int peer_rmbe_size; /* size of peer rx buffer */ + atomic_t peer_rmbe_space;/* remaining free bytes in peer + * rmbe + */ + int rtoken_idx; /* idx to peer RMB rkey/addr */ + + struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ + struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ + int rmbe_size_short;/* compressed notation */ + int rmbe_update_limit; + /* lower limit for consumer + * cursor update + */ + + struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging + * buffer for CDC msg send + * .prod cf. TCP snd_nxt + * .cons cf. TCP sends ack + */ + union smc_host_cursor local_tx_ctrl_fin; + /* prod crsr - confirmed by peer + */ + union smc_host_cursor tx_curs_prep; /* tx - prepared data + * snd_max..wmem_alloc + */ + union smc_host_cursor tx_curs_sent; /* tx - sent data + * snd_nxt ? + */ + union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer + * snd-wnd-begin ? + */ + atomic_t sndbuf_space; /* remaining space in sndbuf */ + u16 tx_cdc_seq; /* sequence # for CDC send */ + u16 tx_cdc_seq_fin; /* sequence # - tx completed */ + spinlock_t send_lock; /* protect wr_sends */ + atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe + * - inc when post wqe, + * - dec on polled tx cqe + */ + wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ + atomic_t tx_pushing; /* nr_threads trying tx push */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ + u32 tx_off; /* base offset in peer rmb */ + + struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. + * .prod cf. TCP rcv_nxt + * .cons cf. TCP snd_una + */ + union smc_host_cursor rx_curs_confirmed; /* confirmed to peer + * source of snd_una ? 
+ */ + union smc_host_cursor urg_curs; /* points at urgent byte */ + enum smc_urg_state urg_state; + bool urg_tx_pend; /* urgent data staged */ + bool urg_rx_skip_pend; + /* indicate urgent oob data + * read, but previous regular + * data still pending + */ + char urg_rx_byte; /* urgent byte */ + bool tx_in_release_sock; + /* flush pending tx data in + * sock release_cb() + */ + atomic_t bytes_to_rcv; /* arrived data, + * not yet received + */ + atomic_t splice_pending; /* number of spliced bytes + * pending processing + */ +#ifndef KERNEL_HAS_ATOMIC64 + spinlock_t acurs_lock; /* protect cursors */ +#endif + struct work_struct close_work; /* peer sent some closing */ + struct work_struct abort_work; /* abort the connection */ + struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ + u8 rx_off; /* receive offset: + * 0 for SMC-R, 32 for SMC-D + */ + u64 peer_token; /* SMC-D token of peer */ + u8 killed : 1; /* abnormal termination */ + u8 freed : 1; /* normal termiation */ + u8 out_of_sync : 1; /* out of sync with peer */ +}; + +struct smc_sock { /* smc sock container */ + struct sock sk; + struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ + void (*clcsk_data_ready)(struct sock *sk); + /* original data_ready fct. */ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. 
*/ + struct smc_connection conn; /* smc connection */ + struct smc_sock *listen_smc; /* listen parent */ + struct work_struct connect_work; /* handle non-blocking connect*/ + struct work_struct tcp_listen_work;/* handle tcp socket accepts */ + struct work_struct smc_listen_work;/* prepare new accept socket */ + struct list_head accept_q; /* sockets to be accepted */ + spinlock_t accept_q_lock; /* protects accept_q */ + bool limit_smc_hs; /* put constraint on handshake */ + bool use_fallback; /* fallback to tcp */ + int fallback_rsn; /* reason for fallback */ + u32 peer_diagnosis; /* decline reason from peer */ + atomic_t queued_smc_hs; /* queued smc handshakes */ + struct inet_connection_sock_af_ops af_ops; + const struct inet_connection_sock_af_ops *ori_af_ops; + /* original af ops */ + int sockopt_defer_accept; + /* sockopt TCP_DEFER_ACCEPT + * value + */ + u8 wait_close_tx_prepared : 1; + /* shutdown wr or close + * started, waiting for unsent + * data to be sent + */ + u8 connect_nonblock : 1; + /* non-blocking connect in + * flight + */ + struct mutex clcsock_release_lock; + /* protects clcsock of a listen + * socket + * */ +}; + static inline struct smc_sock *smc_sk(const struct sock *sk) { return (struct smc_sock *)sk; diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 1e806869e561..53f63bfbaf5f 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -28,15 +28,13 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, { struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; struct smc_connection *conn = cdcpend->conn; - struct smc_buf_desc *sndbuf_desc; struct smc_sock *smc; int diff; - sndbuf_desc = conn->sndbuf_desc; smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); - if (!wc_status && sndbuf_desc) { - diff = smc_curs_diff(sndbuf_desc->len, + if (!wc_status) { + diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); /* sndbuf_space is decreased in 
smc_sendmsg */ @@ -360,8 +358,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, } /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - if ((diff_cons && smc_tx_prepared_sends(conn) && - conn->local_tx_ctrl.prod_flags.write_blocked) || + if ((diff_cons && smc_tx_prepared_sends(conn)) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || conn->local_rx_ctrl.prod_flags.urg_data_pending) { if (!sock_owned_by_user(&smc->sk)) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 026a5078acfd..1472f31480d8 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -795,13 +795,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; vec.iov_len = send_len; - mutex_lock(&smc->clcsock_release_lock); - if (!smc->clcsock || !smc->clcsock->sk) { - mutex_unlock(&smc->clcsock_release_lock); - return -EPROTO; - } len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); - mutex_unlock(&smc->clcsock_release_lock); if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 
0 : len; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7c5c5ad81324..8bc8a4f15a9c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -379,13 +379,9 @@ static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, struct netlink_callback *cb) { char smc_ibname[IB_DEVICE_NAME_MAX]; - struct smc_link_stats *lnk_stats; - struct smc_link_ib_stats *stats; u8 smc_gid_target[41]; struct nlattr *attrs; u32 link_uid = 0; - int cpu, i, size; - u64 *src, *sum; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -426,43 +422,10 @@ static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, smc_gid_be16_convert(smc_gid_target, link->peer_gid); if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target)) goto errattr; - lnk_stats = &lgr->lnk_stats[link->link_idx]; - stats = kzalloc(sizeof(*stats), GFP_KERNEL); - if (!stats) - goto errattr; - size = sizeof(*stats) / sizeof(u64); - for_each_possible_cpu(cpu) { - src = (u64 *)per_cpu_ptr(lnk_stats->ib_stats, cpu); - sum = (u64 *)stats; - for (i = 0; i < size; i++) - *(sum++) += *(src++); - } - if (nla_put_u32(skb, SMC_NLA_LINK_QPN, lnk_stats->qpn)) - goto errstats; - if (nla_put_u32(skb, SMC_NLA_LINK_PEER_QPN, lnk_stats->peer_qpn)) - goto errstats; - if (nla_put_u64_64bit(skb, SMC_NLA_LINK_SWR_CNT, - stats->s_wr_cnt, SMC_NLA_LINK_UNSPEC)) - goto errstats; - if (nla_put_u64_64bit(skb, SMC_NLA_LINK_SWC_CNT, - stats->s_wc_cnt, SMC_NLA_LINK_UNSPEC)) - goto errstats; - if (nla_put_u64_64bit(skb, SMC_NLA_LINK_RWR_CNT, - stats->r_wr_cnt, SMC_NLA_LINK_UNSPEC)) - goto errstats; - if (nla_put_u64_64bit(skb, SMC_NLA_LINK_RWC_CNT, - stats->r_wc_cnt, SMC_NLA_LINK_UNSPEC)) - goto errstats; - if (nla_put_u64_64bit(skb, SMC_NLA_LINK_WWC_CNT, - stats->rw_wc_cnt, SMC_NLA_LINK_UNSPEC)) - goto errstats; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); - kfree(stats); return 0; -errstats: - kfree(stats); errattr: nla_nest_cancel(skb, attrs); errout: @@ -491,7 +454,7 @@ static int 
smc_nl_handle_lgr(struct smc_link_group *lgr, if (!list_links) goto out; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_UNUSED) + if (!smc_link_usable(&lgr->lnk[i])) continue; if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb)) goto errout; @@ -516,7 +479,7 @@ static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr, int num = 0; spin_lock_bh(&smc_lgr->lock); - list_for_each_entry(lgr, &smc_lgr->list, stats_list) { + list_for_each_entry(lgr, &smc_lgr->list, list) { if (num < snum) goto next; if (smc_nl_handle_lgr(lgr, skb, cb, list_links)) @@ -644,7 +607,7 @@ int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) { bool list_links = false; - smc_nl_fill_lgr_list(&smc_lgr_stats_list, skb, cb, list_links); + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); return skb->len; } @@ -652,7 +615,7 @@ int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb) { bool list_links = true; - smc_nl_fill_lgr_list(&smc_lgr_stats_list, skb, cb, list_links); + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); return skb->len; } @@ -768,50 +731,6 @@ static void smcr_copy_dev_info_to_link(struct smc_link *link) link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1]; } -int smcr_iw_net_reserve_ports(struct net *net) -{ - int ports_base = rsvd_ports_base; - struct sockaddr_in laddr; - int rc = 0, i, j; - - for (i = 0; i < SMC_IWARP_RSVD_PORTS_NUM; i++) { - rc = __sock_create(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, - &net->smc.rsvd_sock[i], 1); - if (rc < 0) - goto release; - memset(&laddr, 0, sizeof(laddr)); - laddr.sin_port = htons(ports_base + i); - /* keep the rsvd ports */ - rc = kernel_bind(net->smc.rsvd_sock[i], (struct sockaddr *)&laddr, - sizeof(struct sockaddr_in)); - if (rc) { - sock_release(net->smc.rsvd_sock[i]); - net->smc.rsvd_sock[i] = NULL; - goto release; - } - } - pr_info_ratelimited("smc: netns %pK reserved ports for eRDMA OOB\n", net); - return 0; - -release: - for (j = 0; j < i; 
j++) { - sock_release(net->smc.rsvd_sock[j]); - net->smc.rsvd_sock[j] = NULL; - } - return rc; -} - -void smcr_iw_net_release_ports(struct net *net) -{ - int i; - - for (i = 0; i < SMC_IWARP_RSVD_PORTS_NUM; i++) { - sock_release(net->smc.rsvd_sock[i]); - net->smc.rsvd_sock[i] = NULL; - } - pr_info_ratelimited("smc: netns %pK released ports used by eRDMA OOB\n", net); -} - static void smcr_link_iw_extension(struct iw_ext_conn_param *iw_param, struct sock *clcsk) { iw_param->sk_addr.family = clcsk->sk_family; @@ -843,14 +762,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; } - - if (!lnk->smcibdev->ibdev) { - /* check if smcibdev still available */ - memset(lnk, 0, sizeof(struct smc_link)); - lnk->state = SMC_LNK_UNUSED; - return SMC_CLC_DECL_NOSMCRDEV; - } - get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ @@ -920,13 +831,11 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { - struct smc_ib_device *ibdev; struct smc_link_group *lgr; struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; - int ibport; int rc = 0; int i; @@ -980,6 +889,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ + struct smc_ib_device *ibdev; + int ibport; + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; lgr->smc_version = ini->smcr_version; memcpy(lgr->peer_systemid, ini->peer_systemid, @@ -995,22 +907,11 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) ibdev = ini->ib_dev; ibport = ini->ib_port; } - rc = smc_lgr_link_stats_init(lgr); - if (rc) - goto free_wq; - - mutex_lock(&smc_ib_devices.mutex); - if (list_empty(&ibdev->list) || - test_bit(ibport, ibdev->ports_going_away)) { - /* ibdev unavailable */ - rc = SMC_CLC_DECL_NOSMCRDEV; - goto free_stats; - } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); rc = smc_wr_alloc_lgr_mem(lgr); if (rc) - goto free_stats; + goto free_wq; smc_llc_lgr_init(lgr, smc); link_idx = SMC_SINGLE_LINK; @@ -1020,7 +921,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { smc_wr_free_lgr_mem(lgr); - goto free_stats; + goto free_wq; } lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; @@ -1032,16 +933,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_lock_bh(lgr_lock); list_add_tail(&lgr->list, lgr_list); spin_unlock_bh(lgr_lock); - if (!ini->is_smcd) - mutex_unlock(&smc_ib_devices.mutex); return 0; -free_stats: - if (!ini->is_smcd) - smc_lgr_link_stats_free(lgr); free_wq: - if (!ini->is_smcd) - mutex_unlock(&smc_ib_devices.mutex); destroy_workqueue(lgr->tx_wq); free_lgr: kfree(lgr); @@ -1050,16 +944,10 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id); out: if (rc < 0) { - switch (rc) { - case -ENOMEM: + if (rc == -ENOMEM) rc = SMC_CLC_DECL_MEM; - break; - case SMC_CLC_DECL_NOSMCRDEV: - break; - default: + else rc = SMC_CLC_DECL_INTERR; - break; - } } return rc; } @@ -1247,9 +1135,8 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, smc_buf_free(lgr, is_rmb, buf_desc); } else { - if (is_rmb) - /* memzero_explicit 
provides potential memory barrier semantics */ - memzero_explicit(buf_desc->cpu_addr, buf_desc->len); + /* memzero_explicit provides potential memory barrier semantics */ + memzero_explicit(buf_desc->cpu_addr, buf_desc->len); WRITE_ONCE(buf_desc->used, 0); } } @@ -1261,6 +1148,7 @@ static void smc_buf_unuse(struct smc_connection *conn, if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { smcr_buf_unuse(conn->sndbuf_desc, false, lgr); } else { + memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); WRITE_ONCE(conn->sndbuf_desc->used, 0); } } @@ -1375,14 +1263,10 @@ static void __smcr_link_clear(struct smc_link *lnk) struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; - smcr_buf_unmap_lgr(lnk); - smc_ib_destroy_queue_pair(lnk); - smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; - smcr_link_stats_clear(lnk); memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) @@ -1399,9 +1283,12 @@ void smcr_link_clear(struct smc_link *lnk, bool log) lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); + smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_error(lnk); smc_wr_free_link(lnk); + smc_ib_destroy_queue_pair(lnk); + smc_ib_dealloc_protection_domain(lnk); smcr_link_put(lnk); /* theoretically last link_put */ } @@ -1421,11 +1308,8 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, { int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_UNUSED) - continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - } if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); @@ -1495,7 +1379,6 @@ static void __smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } - 
smc_lgr_link_stats_free(lgr); kfree(lgr); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e0c258777ab4..975cdcc910b7 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -20,7 +20,6 @@ #include "smc.h" #include "smc_ib.h" -#include "smc_stats.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ @@ -88,7 +87,6 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ - struct smc_ib_cq *smcibcq; /* cq for recv & send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ @@ -116,6 +114,7 @@ struct smc_link { struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ @@ -246,7 +245,6 @@ struct smc_llc_flow { struct smc_link_group { struct list_head list; - struct list_head stats_list; struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ @@ -278,8 +276,8 @@ struct smc_link_group { /* client or server */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ - struct smc_link_stats lnk_stats[SMC_LINKS_PER_LGR_MAX]; - /* smc link statistics */ + struct smc_wr_v2_buf *wr_rx_buf_v2; + /* WR v2 recv payload buffer */ struct smc_wr_v2_buf *wr_tx_buf_v2; /* WR v2 send payload buffer */ char peer_systemid[SMC_SYSTEMID_LEN]; @@ -555,8 +553,6 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); -int smcr_iw_net_reserve_ports(struct net *net); -void 
smcr_iw_net_release_ports(struct net *net); int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); @@ -566,19 +562,4 @@ static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; } - -static inline void smcr_link_stats_clear(struct smc_link *link) -{ - struct smc_link_group *lgr = link->lgr; - struct smc_link_stats *lnk_stats; - int cpu; - - lnk_stats = &lgr->lnk_stats[link->link_idx]; - lnk_stats->qpn = 0; - lnk_stats->peer_qpn = 0; - for_each_possible_cpu(cpu) { - memset((u64 *)per_cpu_ptr(lnk_stats->ib_stats, cpu), 0, - sizeof(struct smc_link_ib_stats)); - } -} #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 6edc739f8e08..22d38206ed48 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -196,25 +196,24 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; - int rc = 0, num = 0, slot; + int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); - - for (slot = 0; slot < SMC_HTABLE_SIZE; slot++) { - head = &prot->h.smc_hash->ht[slot]; - - sk_for_each(sk, head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (num < snum) - goto next; - rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc < 0) - goto out; + head = &prot->h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; next: - num++; - } + num++; } out: diff --git a/net/smc/smc_dim.c b/net/smc/smc_dim.c deleted file mode 100644 index 9b13134adfe7..000000000000 --- a/net/smc/smc_dim.c +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 
(c) 2022, Alibaba Group. - * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. - */ - -#include -#include "smc_dim.h" - -#define SMC_IS_SIGNIFICANT_DIFF(val, ref, threshold) \ - ((ref) && (((100UL * abs((val) - (ref))) / (ref)) >= (threshold))) - -#define SMC_CPMS_THRESHOLD 5 -#define SMC_CPERATIO_THRESHOLD 25 -#define SMC_MAX_FLUCTUATIONS 3 -#define CPU_IDLE_UTIL_THRESHOLD 5 -#define CPU_SOFTIRQ_UTIL_THRESHOLD 10 - -#define SMC_DIM_PARAMS_NUM_PROFILES 4 -#define SMC_DIM_START_PROFILE 0 - -static const struct dim_cq_moder -smc_dim_profile[SMC_DIM_PARAMS_NUM_PROFILES] = { - {1, 0, 2, 0}, - {4, 0, 8, 0}, - {16, 0, 16, 0}, - {32, 0, 32, 0}, -}; - -static void smc_dim_work(struct work_struct *w) -{ - struct dim *dim = container_of(w, struct dim, work); - struct ib_cq *cq = dim->priv; - - u16 usec = smc_dim_profile[dim->profile_ix].usec; - u16 comps = smc_dim_profile[dim->profile_ix].comps; - - dim->state = DIM_START_MEASURE; - cq->device->ops.modify_cq(cq, comps, usec); -} - -void smc_dim_init(struct ib_cq *cq) -{ - struct smc_dim *smc_dim; - struct dim *dim; - - if (!cq->device->ops.modify_cq) - return; - - smc_dim = kzalloc(sizeof(*smc_dim), GFP_KERNEL); - if (!smc_dim) - return; - - smc_dim->use_dim = cq->device->use_cq_dim; - dim = to_dim(smc_dim); - dim->state = DIM_START_MEASURE; - dim->tune_state = DIM_GOING_RIGHT; - dim->profile_ix = SMC_DIM_START_PROFILE; - dim->priv = cq; - cq->dim = dim; - INIT_WORK(&dim->work, smc_dim_work); -} - -void smc_dim_destroy(struct ib_cq *cq) -{ - if (!cq->dim) - return; - - cancel_work_sync(&cq->dim->work); - kfree(cq->dim); -} - -static inline void smc_dim_param_clear(struct dim *dim) -{ - dim->steps_right = 0; - dim->steps_left = 0; - dim->tired = 0; - dim->profile_ix = SMC_DIM_START_PROFILE; - dim->tune_state = DIM_GOING_RIGHT; -} - -static inline void smc_dim_reset(struct dim *dim) -{ - int prev_ix = dim->profile_ix; - - smc_dim_param_clear(dim); - if (prev_ix != dim->profile_ix) - schedule_work(&dim->work); 
- else - dim->state = DIM_START_MEASURE; -} - -static int smc_dim_step(struct dim *dim) -{ - if (dim->tune_state == DIM_GOING_RIGHT) { - if (dim->profile_ix == (SMC_DIM_PARAMS_NUM_PROFILES - 1)) - return DIM_ON_EDGE; - dim->profile_ix++; - dim->steps_right++; - } - if (dim->tune_state == DIM_GOING_LEFT) { - if (dim->profile_ix == 0) - return DIM_ON_EDGE; - dim->profile_ix--; - dim->steps_left++; - } - - return DIM_STEPPED; -} - -static int smc_dim_stats_compare(struct dim_stats *curr, struct dim_stats *prev) -{ - /* first stat */ - if (!prev->cpms) - return DIM_STATS_BETTER; - - if (SMC_IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms, SMC_CPMS_THRESHOLD)) - return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER : - DIM_STATS_WORSE; - - if (SMC_IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio, SMC_CPERATIO_THRESHOLD)) - return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER : - DIM_STATS_WORSE; - - return DIM_STATS_SAME; -} - -static void smc_dim_exit_parking(struct dim *dim) -{ - dim->tune_state = dim->profile_ix ? 
DIM_GOING_LEFT : DIM_GOING_RIGHT; - smc_dim_step(dim); - dim->tired = 0; -} - -static bool smc_dim_decision(struct dim_stats *curr_stats, struct dim *dim) -{ - int prev_state = dim->tune_state; - int prev_ix = dim->profile_ix; - int stats_res = smc_dim_stats_compare(curr_stats, - &dim->prev_stats); - - if (curr_stats->cpms < 50) { - smc_dim_param_clear(dim); - goto out; - } - - switch (dim->tune_state) { - case DIM_PARKING_ON_TOP: - if (stats_res != DIM_STATS_SAME) { - if (dim->tired++ > SMC_MAX_FLUCTUATIONS) - smc_dim_exit_parking(dim); - } else { - dim->tired = 0; - } - break; - case DIM_GOING_RIGHT: - case DIM_GOING_LEFT: - if (stats_res != DIM_STATS_BETTER) { - dim_turn(dim); - } else if (dim_on_top(dim)) { - dim_park_on_top(dim); - break; - } - - if (smc_dim_step(dim) == DIM_ON_EDGE) - dim_park_on_top(dim); - break; - } - -out: - if (prev_state != DIM_PARKING_ON_TOP || - dim->tune_state != DIM_PARKING_ON_TOP) - dim->prev_stats = *curr_stats; - - return dim->profile_ix != prev_ix; -} - -static bool smc_dim_check_utilization(struct dim *dim) -{ - struct smc_dim *smc_dim = to_smcdim(dim); - int cpu = smp_processor_id(); - struct kernel_cpustat kcpustat; - u32 idle_percent, softirq_percent; - u64 wall, wall_idle, diff_wall, softirq; - - wall_idle = get_cpu_idle_time(cpu, &wall, 1); - kcpustat_cpu_fetch(&kcpustat, cpu); - - softirq = div_u64(kcpustat_field(&kcpustat, CPUTIME_SOFTIRQ, cpu), NSEC_PER_USEC); - diff_wall = wall - smc_dim->prev_wall; - idle_percent = div64_u64(100 * (wall_idle - smc_dim->prev_idle), diff_wall); - softirq_percent = div64_u64(100 * (softirq - smc_dim->prev_softirq), diff_wall); - - smc_dim->prev_softirq = softirq; - smc_dim->prev_idle = wall_idle; - smc_dim->prev_wall = wall; - smc_dim->prev_idle_percent = idle_percent; - smc_dim->prev_si_percent = softirq_percent; - - return idle_percent < CPU_IDLE_UTIL_THRESHOLD && - softirq_percent >= CPU_SOFTIRQ_UTIL_THRESHOLD; -} - -void smc_dim(struct dim *dim, u64 completions) -{ - struct ib_cq *cq 
= dim->priv; - struct smc_dim *smc_dim = to_smcdim(dim); - struct dim_sample *curr_sample = &dim->measuring_sample; - struct dim_stats curr_stats; - u32 nevents; - - if (unlikely(smc_dim->use_dim != cq->device->use_cq_dim)) { - smc_dim->use_dim = cq->device->use_cq_dim; - if (!smc_dim->use_dim) - smc_dim_reset(dim); - } - - if (!smc_dim->use_dim) - return; - - dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0, - curr_sample->comp_ctr + completions, - &dim->measuring_sample); - - switch (dim->state) { - case DIM_MEASURE_IN_PROGRESS: - nevents = curr_sample->event_ctr - dim->start_sample.event_ctr; - if (nevents < DIM_NEVENTS) - break; - if (!smc_dim_check_utilization(dim)) { - smc_dim_reset(dim); - break; - } - dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats); - if (smc_dim_decision(&curr_stats, dim)) { - dim->state = DIM_APPLY_NEW_PROFILE; - schedule_work(&dim->work); - break; - } - fallthrough; - case DIM_START_MEASURE: - dim->state = DIM_MEASURE_IN_PROGRESS; - dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0, - curr_sample->comp_ctr, - &dim->start_sample); - break; - case DIM_APPLY_NEW_PROFILE: - break; - } -} diff --git a/net/smc/smc_dim.h b/net/smc/smc_dim.h deleted file mode 100644 index d844f2d27ef5..000000000000 --- a/net/smc/smc_dim.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2022, Alibaba Group. 
- */ - -#ifndef _SMC_DIM_H -#define _SMC_DIM_H - -#include -#include - -struct smc_dim { - struct dim dim; - bool use_dim; - u64 prev_idle; - u64 prev_softirq; - u64 prev_wall; - u32 prev_idle_percent; - u32 prev_si_percent; -}; - -static inline struct smc_dim *to_smcdim(struct dim *dim) -{ - return (struct smc_dim *)dim; -} - -static inline struct dim *to_dim(struct smc_dim *smcdim) -{ - return (struct dim *)smcdim; -} - -void smc_dim_init(struct ib_cq *cq); -void smc_dim_destroy(struct ib_cq *cq); -void smc_dim(struct dim *dim, u64 completions); - -#endif /* _SMC_DIM_H */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index a0833f6cd454..1cb600767e88 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -27,7 +27,6 @@ #include "smc_wr.h" #include "smc.h" #include "smc_netlink.h" -#include "smc_dim.h" #define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ @@ -43,48 +42,6 @@ struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ -static void smc_ib_modify_qp_iw_extension(struct smc_link *lnk) -{ - struct iw_ext_conn_param *iw_param = &lnk->iw_conn_param; - __be32 saddr_v4, daddr_v4; - bool use_rsvd_ports; - - /* IPs are stored as union, treat them as IPv4 - * for easy comparison. - */ - saddr_v4 = iw_param->sk_addr.saddr_v4; - daddr_v4 = iw_param->sk_addr.daddr_v4; - - if (saddr_v4 < daddr_v4) { - use_rsvd_ports = true; - } else if (saddr_v4 > daddr_v4) { - use_rsvd_ports = false; - } else { - /* if sip == dip, then lqpn must be - * different from rqpn. - */ - if (lnk->roce_qp->qp_num < lnk->peer_qpn) - use_rsvd_ports = true; - else - use_rsvd_ports = false; - } - - /* eRDMA iWARP MAX qp_num is 128K, that is a maximum of 128K - * RC links can be formed. So here we reserve 2^4 ports in - * one side, and with maximum of 2^16 ports in another side - * to form 2^20 different 5-tuples for eRDMA iWARP RC link. 
- */ - if (use_rsvd_ports) { - iw_param->sk_addr.sport = - rsvd_ports_base + ((lnk->peer_qpn >> 16) & 0xF); - iw_param->sk_addr.dport = htons(lnk->peer_qpn & 0xFFFF); - } else { - iw_param->sk_addr.sport = lnk->roce_qp->qp_num & 0xFFFF; - iw_param->sk_addr.dport = htons(rsvd_ports_base + - ((lnk->roce_qp->qp_num >> 16) & 0xF)); - } -} - static int smc_ib_modify_qp_init(struct smc_link *lnk) { struct ib_qp_attr qp_attr; @@ -130,12 +87,6 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk) */ qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER; - if (reserve_mode && - smc_ib_is_iwarp(lnk->smcibdev->ibdev, lnk->ibport)) { - smc_ib_modify_qp_iw_extension(lnk); - qp_attr_mask |= IB_QP_RESERVED1; - } - return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask); } @@ -180,7 +131,10 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + IB_CQ_SOLICITED_MASK); + if (rc) + goto out; rc = smc_wr_rx_post_init(lnk); if (rc) goto out; @@ -324,8 +278,8 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); - if (smc_ib_is_iwarp(smcibdev->ibdev, ibport) || - (!IS_ERR(ndev) && + if ((smcibdev->ibdev->port_data[ibport].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_IWARP) || (!IS_ERR(ndev) && ((!vlan_id && !is_vlan_dev(ndev)) || (vlan_id && is_vlan_dev(ndev) && vlan_dev_vlan_id(ndev) == vlan_id)))) { @@ -670,31 +624,6 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev) -{ - struct smc_ib_cq *smcibcq, *cq; - int min, i; - - smcibcq = smcibdev->smcibcq; - cq = smcibcq; - min = cq->load; - - for (i = 0; i < smcibdev->num_cq; i++) { - if (smcibcq[i].load < min) { - cq = &smcibcq[i]; - min = cq->load; - } - } - - cq->load++; - return cq; -} - -static void smc_ib_put_cq(struct smc_ib_cq *smcibcq) -{ 
- smcibcq->load--; -} - static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -718,23 +647,20 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - if (lnk->roce_qp) { + if (lnk->roce_qp) ib_destroy_qp(lnk->roce_qp); - smc_ib_put_cq(lnk->smcibcq); - } lnk->roce_qp = NULL; - lnk->smcibcq = NULL; } /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - struct smc_ib_cq *smcibcq = smc_ib_get_least_used_cq(lnk->smcibdev); + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = smcibcq->ib_cq, - .recv_cq = smcibcq->ib_cq, + .send_cq = lnk->smcibdev->roce_cq_send, + .recv_cq = lnk->smcibdev->roce_cq_recv, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, @@ -743,26 +669,26 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = 1, + .max_recv_sge = sges_per_buf, .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, }; struct ib_device *ib_dev = lnk->smcibdev->ibdev; + struct ib_port_immutable immutable; int rc; - if (smc_ib_is_iwarp(ib_dev, lnk->ibport)) + ib_dev->ops.get_port_immutable(ib_dev, lnk->ibport, &immutable); + if (immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP) qp_attr.create_flags |= IB_QP_CREATE_IWARP_WITHOUT_CM; lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); - if (IS_ERR(lnk->roce_qp)) { + if (IS_ERR(lnk->roce_qp)) lnk->roce_qp = NULL; - } else { - lnk->smcibcq = smcibcq; + else smc_wr_remember_qp_attr(lnk); - } return rc; } @@ -909,35 +835,12 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, 
buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } -static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) -{ - int i; - - for (i = 0; i < smcibdev->num_cq; i++) { - if (smcibdev->smcibcq[i].ib_cq) { - smc_dim_destroy(smcibdev->smcibcq[i].ib_cq); - ib_destroy_cq(smcibdev->smcibcq[i].ib_cq); - } - } - smc_wr_remove_dev(smcibdev); - - kfree(smcibdev->smcibcq); -} - -static void cq_event_handler(struct ib_event *event, void *data) -{ - pr_warn_ratelimited("event %u (%s) data %p\n", - event->event, ib_event_msg(event->event), data); -} - long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { - struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; + struct ib_cq_init_attr cqattr = { + .cqe = SMC_MAX_CQE, .comp_vector = 0 }; int cqe_size_order, smc_order; - struct smc_ib_cq *smcibcq; - int i, num_cq; long rc; - u32 option_cqflag = IB_UVERBS_CQ_FLAGS_LOCK_FREE; mutex_lock(&smcibdev->mutex); rc = 0; @@ -948,48 +851,28 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - num_cq = min_t(int, smcibdev->ibdev->num_comp_vectors, - num_online_cpus()); - smcibdev->num_cq = num_cq; - smcibdev->smcibcq = kcalloc(num_cq, sizeof(*smcibcq), GFP_KERNEL); - if (!smcibdev->smcibcq) { - rc = -ENOMEM; - goto err; + smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); + if (IS_ERR(smcibdev->roce_cq_send)) { + smcibdev->roce_cq_send = NULL; + goto out; } - - /* initialize CQs */ - for (i = 0; i < num_cq; i++) { - smcibcq = &smcibdev->smcibcq[i]; - smcibcq->smcibdev = smcibdev; - cqattr.comp_vector = i; -again: - cqattr.flags |= option_cqflag; - smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, - smc_wr_cq_handler, cq_event_handler, - smcibcq, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); - if (rc == 
-EOPNOTSUPP) { - smcibcq->ib_cq = NULL; - cqattr.flags &= ~option_cqflag; - option_cqflag = 0; - goto again; - } - if (IS_ERR(smcibcq->ib_cq)) { - smcibcq->ib_cq = NULL; - goto err; - } - - smc_dim_init(smcibcq->ib_cq); - rc = ib_req_notify_cq(smcibcq->ib_cq, IB_CQ_NEXT_COMP); - if (rc) - goto err; + smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); + if (IS_ERR(smcibdev->roce_cq_recv)) { + smcibdev->roce_cq_recv = NULL; + goto err; } smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; goto out; err: - smc_ib_cleanup_cq(smcibdev); + ib_destroy_cq(smcibdev->roce_cq_send); out: mutex_unlock(&smcibdev->mutex); return rc; @@ -1001,7 +884,9 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) goto out; smcibdev->initialized = 0; - smc_ib_cleanup_cq(smcibdev); + ib_destroy_cq(smcibdev->roce_cq_recv); + ib_destroy_cq(smcibdev->roce_cq_send); + smc_wr_remove_dev(smcibdev); out: mutex_unlock(&smcibdev->mutex); } @@ -1051,91 +936,12 @@ void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) mutex_unlock(&smc_ib_devices.mutex); } -bool smc_ib_is_iwarp(struct ib_device *ibdev, u8 ibport) -{ - struct ib_port_immutable immutable; - - ibdev->ops.get_port_immutable(ibdev, ibport, &immutable); - return immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; -} - -/* Reserve socket ports of each net namespace which can be accessed - * by eRDMA (iWARP) device for out-bound RC establishment. 
- */ -static int smc_iw_reserve_ports(struct smc_ib_device *smcibdev) -{ - struct ib_device *ibdev = smcibdev->ibdev; - struct net *net, *_net; - int rc; - - if (!reserve_mode) - return 0; - if (!smc_ib_is_iwarp(ibdev, 1)) - return 0; - - down_read(&net_rwsem); - for_each_net(net) { - /* for net can access ibdev */ - if (!rdma_dev_access_netns(ibdev, net)) - continue; - /* check if already reserved*/ - if (atomic_inc_return(&net->smc.iwarp_cnt) > 1) - continue; - - rc = smcr_iw_net_reserve_ports(net); - if (rc) { - atomic_dec(&net->smc.iwarp_cnt); - goto release; - } - } - up_read(&net_rwsem); - return 0; - -release: - /* release ports and recover */ - for_each_net(_net) { - if (_net == net) - break; - if (!rdma_dev_access_netns(ibdev, _net)) - continue; - if (!atomic_dec_and_test(&_net->smc.iwarp_cnt)) - continue; - smcr_iw_net_release_ports(_net); - } - up_read(&net_rwsem); - return rc; -} - -static void smc_iw_release_ports(struct smc_ib_device *smcibdev) -{ - struct ib_device *ibdev = smcibdev->ibdev; - struct net *net; - - if (!reserve_mode) - return; - if (!smc_ib_is_iwarp(ibdev, 1)) - return; - - down_read(&net_rwsem); - for_each_net(net) { - /* for net can access ibdev */ - if (!rdma_dev_access_netns(ibdev, net)) - continue; - /* check if need release */ - if (!atomic_dec_and_test(&net->smc.iwarp_cnt)) - continue; - - smcr_iw_net_release_ports(net); - } - up_read(&net_rwsem); -} - /* callback function for ib_register_client() */ static int smc_ib_add_dev(struct ib_device *ibdev) { struct smc_ib_device *smcibdev; - int i, rc = 0; u8 port_cnt; + int i; if (ibdev->node_type != RDMA_NODE_IB_CA && ibdev->node_type != RDMA_NODE_RNIC) @@ -1146,11 +952,6 @@ static int smc_ib_add_dev(struct ib_device *ibdev) return -ENOMEM; smcibdev->ibdev = ibdev; - rc = smc_iw_reserve_ports(smcibdev); - if (rc) { - kfree(smcibdev); - return rc; - } INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); atomic_set(&smcibdev->lnk_cnt, 0); 
init_waitqueue_head(&smcibdev->lnks_deleted); @@ -1199,7 +1000,6 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) pr_warn_ratelimited("smc: removing ib device %s\n", smcibdev->ibdev->name); smc_smcr_terminate_all(smcibdev); - smc_iw_release_ports(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); cancel_work_sync(&smcibdev->port_event_work); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 709c383e776a..034295676e88 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -23,7 +23,6 @@ #define SMC_GID_SIZE sizeof(union ib_gid) #define SMC_IB_MAX_SEND_SGE 2 -#define SMC_IWARP_RSVD_PORTS_BASE 33800 struct smc_ib_devices { /* list of smc ib devices definition */ struct list_head list; @@ -32,22 +31,16 @@ struct smc_ib_devices { /* list of smc ib devices definition */ extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ -extern struct smc_lgr_list smc_lgr_stats_list; /* list of statistic linkgroups */ - -struct smc_ib_cq { /* ib_cq wrapper for smc */ - struct smc_ib_device *smcibdev; /* parent ib device */ - struct ib_cq *ib_cq; /* real ib_cq for link */ - struct tasklet_struct tasklet; /* tasklet for wr */ - int load; /* load of current cq */ -}; struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. 
port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - int num_cq; /* num of snd/rcv cq */ - struct smc_ib_cq *smcibcq; /* send & recv cqs */ + struct ib_cq *roce_cq_send; /* send completion queue */ + struct ib_cq *roce_cq_recv; /* recv completion queue */ + struct tasklet_struct send_tasklet; /* called by send cq handler */ + struct tasklet_struct recv_tasklet; /* called by recv cq handler */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; @@ -123,6 +116,5 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, int smc_ib_find_route(__be32 saddr, __be32 daddr, u8 nexthop_mac[], u8 *uses_gateway); bool smc_ib_is_valid_local_systemid(void); -bool smc_ib_is_iwarp(struct ib_device *ibdev, u8 ibport); int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index bc1304b243ab..a09fe34d94eb 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -988,14 +988,13 @@ static int smc_llc_cli_conf_link(struct smc_link *link, } static void smc_llc_save_add_link_rkeys(struct smc_link *link, - struct smc_link *link_new, - void *llc_msg) + struct smc_link *link_new) { struct smc_llc_msg_add_link_v2_ext *ext; struct smc_link_group *lgr = link->lgr; int max, i; - ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)llc_msg + + ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + SMC_WR_TX_SIZE); max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); down_write(&lgr->rmbs_lock); @@ -1011,11 +1010,7 @@ static void smc_llc_save_add_link_rkeys(struct smc_link *link, static void smc_llc_save_add_link_info(struct smc_link *link, struct smc_llc_msg_add_link *add_llc) { - struct smc_link_stats *lnk_stats = - &link->lgr->lnk_stats[link->link_idx]; - link->peer_qpn = ntoh24(add_llc->sender_qp_num); - lnk_stats->peer_qpn = link->peer_qpn; memcpy(link->peer_gid, add_llc->sender_gid, 
SMC_GID_SIZE); memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); @@ -1090,7 +1085,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (rc) goto out_clear_lnk; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, lnk_new, (void *)llc); + smc_llc_save_add_link_rkeys(link, lnk_new); } else { rc = smc_llc_cli_rkey_exchange(link, lnk_new); if (rc) { @@ -1483,7 +1478,7 @@ int smc_llc_srv_add_link(struct smc_link *link, if (rc) goto out_err; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, link_new, (void *)add_llc); + smc_llc_save_add_link_rkeys(link, link_new); } else { rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) @@ -1792,7 +1787,8 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) if (lgr->smc_version == SMC_V2) { struct smc_llc_msg_delete_rkey_v2 *llcv2; - llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)llc; + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; llcv2->num_inval_rkeys = 0; max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c deleted file mode 100644 index 43cb5c6cd6b8..000000000000 --- a/net/smc/smc_proc.c +++ /dev/null @@ -1,341 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include "smc.h" -#include "smc_proc.h" -#include "smc_core.h" -#include "smc_dim.h" - -static void *smc_get_next(struct seq_file *seq, void *cur) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? 
- smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - struct net *net = seq_file_net(seq); - struct hlist_head *head; - struct sock *sk = cur; - - if (!sk) { - read_lock(&smc_hash->lock); -get_head: - head = &smc_hash->ht[sp->bucket]; - sk = sk_head(head); - sp->offset = 0; - goto get_sk; - } - ++sp->num; - ++sp->offset; - - sk = sk_next(sk); -get_sk: - sk_for_each_from(sk) { - if (!net_eq(sock_net(sk), net)) - continue; - return sk; - } - sp->offset = 0; - if (++sp->bucket < SMC_HTABLE_SIZE) - goto get_head; - - read_unlock(&smc_hash->lock); - return NULL; -} - -static void *smc_seek_last_pos(struct seq_file *seq) -{ - struct smc_proc_private *sp = seq->private; - int offset = sp->offset; - int orig_num = sp->num; - void *rc = NULL; - - if (sp->bucket >= SMC_HTABLE_SIZE) - goto out; - - rc = smc_get_next(seq, NULL); - while (offset-- && rc) - rc = smc_get_next(seq, rc); - - if (rc) - goto out; - - sp->bucket = 0; -out: - sp->num = orig_num; - return rc; -} - -static void *smc_get_idx(struct seq_file *seq, loff_t pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - sp->bucket = 0; - rc = smc_get_next(seq, NULL); - - while (rc && pos) { - rc = smc_get_next(seq, rc); - --pos; - } - return rc; -} - -static void *_smc_conn_start(struct seq_file *seq, loff_t *pos, int protocol) -{ - struct smc_proc_private *sp = seq->private; - void *rc; - - if (*pos && *pos == sp->last_pos) { - rc = smc_seek_last_pos(seq); - if (rc) - goto out; - } - - sp->num = 0; - sp->bucket = 0; - sp->offset = 0; - sp->protocol = protocol; - rc = *pos ? 
smc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - -out: - sp->last_pos = *pos; - return rc; -} - -static void *smc_conn4_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC); -} - -static void *smc_conn6_start(struct seq_file *seq, loff_t *pos) -{ - return _smc_conn_start(seq, pos, SMCPROTO_SMC6); -} - -static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) -{ - struct smc_proc_private *sp = seq->private; - const struct in6_addr *dest, *src; - struct smc_link_group *lgr; - struct socket *clcsock; - struct smc_link *lnk; - struct sock *sk; - bool fb = false; - int i; - - fb = smc->use_fallback; - clcsock = smc->clcsock; - sk = &smc->sk; - - if (protocol == SMCPROTO_SMC) - seq_printf(seq, CONN4_ADDR_FM, sp->num, - clcsock->sk->sk_rcv_saddr, clcsock->sk->sk_num, - clcsock->sk->sk_daddr, ntohs(clcsock->sk->sk_dport)); - else if (protocol == SMCPROTO_SMC6) { - dest = &clcsock->sk->sk_v6_daddr; - src = &clcsock->sk->sk_v6_rcv_saddr; - seq_printf(seq, CONN6_ADDR_FM, sp->num, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], clcsock->sk->sk_num, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], ntohs(clcsock->sk->sk_dport)); - } - - seq_printf(seq, CONN_SK_FM, fb ? 'Y' : 'N', fb ? smc->fallback_rsn : 0, - sk, clcsock->sk, fb ? clcsock->sk->sk_state : sk->sk_state, sock_i_ino(sk)); - - lgr = smc->conn.lgr; - lnk = smc->conn.lnk; - - if (!fb && sk->sk_state == SMC_ACTIVE && lgr && lnk) { - for (i = 0; i < SMC_LGR_ID_SIZE; i++) - seq_printf(seq, "%02X", lgr->id[i]); - - seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', - lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); - } else { - seq_puts(seq, "- - - - - - - -\n"); - } -} - -static int smc_conn_show(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct socket *clcsock; - struct smc_sock *smc; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, - "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", - "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_cnt", "rx_cnt"); - goto out; - } - - smc = smc_sk(v); - clcsock = smc->clcsock; - if (!clcsock) - goto out; - - _conn_show(seq, smc, sp->protocol); -out: - return 0; -} - -static void *smc_conn_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct smc_proc_private *sp = seq->private; - void *rc = NULL; - - if (v == SEQ_START_TOKEN) { - rc = smc_get_idx(seq, 0); - goto out; - } - rc = smc_get_next(seq, v); -out: - ++*pos; - sp->last_pos = *pos; - return rc; -} - -static void smc_conn_stop(struct seq_file *seq, void *v) -{ - struct smc_proc_private *sp = seq->private; - struct smc_hashinfo *smc_hash = - sp->protocol == SMCPROTO_SMC ? 
- smc_proto.h.smc_hash : smc_proto6.h.smc_hash; - - if (v && v != SEQ_START_TOKEN) - read_unlock(&smc_hash->lock); -} - -static struct smc_proc_entry smc_proc[] = { - { - .name = "smc4", - .ops = { - .show = smc_conn_show, - .start = smc_conn4_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#if IS_ENABLED(CONFIG_IPV6) - { - .name = "smc6", - .ops = { - .show = smc_conn_show, - .start = smc_conn6_start, - .next = smc_conn_next, - .stop = smc_conn_stop, - }, - }, -#endif -}; - -static int proc_show_dim(struct seq_file *seq, void *v) -{ - static const char * const state_str[] = {"park", "tired", "right", "left"}; - struct smc_ib_device *ibdev; - int i = 0; - - seq_printf(seq, "%-9s%-6s%-6s%-6s%-6s%-6s%-6s%-6s%-6s%-6s%-8s%-8s%-8s\n", - "dev", "idx", "dim", "idle", "si", "ix", "state", "left", - "right", "tired", "cpms", "epms", "cpe"); - - mutex_lock(&smc_ib_devices.mutex); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (i = 0; i < ibdev->num_cq; i++) { - seq_printf(seq, "%-9s%-6d", ibdev->ibdev->name, i); - if (ibdev->smcibcq[i].ib_cq && ibdev->smcibcq[i].ib_cq->dim) { - struct smc_dim *dim = to_smcdim(ibdev->smcibcq[i].ib_cq->dim); - - seq_printf(seq, "%-6s%-6d%-6d%-6d%-6s%-6d%-6d%-6d%-8d%-8d%-8d\n", - dim->use_dim ? 
"ON" : "OFF", dim->prev_idle_percent, - dim->prev_si_percent, dim->dim.profile_ix, - state_str[dim->dim.tune_state], dim->dim.steps_left, - dim->dim.steps_right, dim->dim.tired, - dim->dim.prev_stats.cpms, dim->dim.prev_stats.epms, - dim->dim.prev_stats.cpe_ratio); - } else { - seq_puts(seq, " - - - - - - - - - - -\n"); - } - } - } - mutex_unlock(&smc_ib_devices.mutex); - return 0; -} - -static int proc_open_dim(struct inode *inode, struct file *file) -{ - single_open(file, proc_show_dim, NULL); - return 0; -} - -static struct proc_ops dim_file_ops = { -.proc_open = proc_open_dim, -.proc_read = seq_read, -.proc_release = single_release, -}; - -static int __net_init smc_proc_dir_init(struct net *net) -{ - struct proc_dir_entry *proc_net_smc; - int i, rc = -ENOMEM; - - proc_net_smc = proc_net_mkdir(net, "smc", net->proc_net); - if (!proc_net_smc) - goto err; - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) { - if (!proc_create_net_data(smc_proc[i].name, 0444, - proc_net_smc, &smc_proc[i].ops, - sizeof(struct smc_proc_private), - NULL)) - goto err_entry; - } - - if (!proc_create("dim", 0444, proc_net_smc, &dim_file_ops)) - goto err_entry; - - net->proc_net_smc = proc_net_smc; - return 0; - -err_entry: - for (i -= 1; i >= 0; i--) - remove_proc_entry(smc_proc[i].name, proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -err: - return rc; -} - -static void __net_exit smc_proc_dir_exit(struct net *net) -{ - int i; - struct proc_dir_entry *proc_net_smc = net->proc_net_smc; - - remove_proc_entry("dim", proc_net_smc); - - for (i = 0; i < ARRAY_SIZE(smc_proc); i++) - remove_proc_entry(smc_proc[i].name, proc_net_smc); - - remove_proc_entry("smc", net->proc_net); -} - -static struct pernet_operations smc_proc_ops = { - .init = smc_proc_dir_init, - .exit = smc_proc_dir_exit, -}; - -int __init smc_proc_init(void) -{ - return register_pernet_subsys(&smc_proc_ops); -} - -void smc_proc_exit(void) -{ - unregister_pernet_subsys(&smc_proc_ops); -} diff --git a/net/smc/smc_proc.h 
b/net/smc/smc_proc.h deleted file mode 100644 index ec59ca03e163..000000000000 --- a/net/smc/smc_proc.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _SMC_PROC_H_ -#define _SMC_PROC_H_ - -#include -#include -#include -#include -#include -#include "smc.h" - -#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") -#define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") - -struct smc_proc_private { - struct seq_net_private p; - int num, bucket, offset; - int protocol; - loff_t last_pos; -}; - -struct smc_proc_entry { - const char *name; - const struct seq_operations ops; -}; - -int __init smc_proc_init(void); -void smc_proc_exit(void); - -#endif diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c index ed99d2f49195..e80e34f7ac15 100644 --- a/net/smc/smc_stats.c +++ b/net/smc/smc_stats.c @@ -17,12 +17,6 @@ #include #include "smc_netlink.h" #include "smc_stats.h" -#include "smc_core.h" - -struct smc_lgr_list smc_lgr_stats_list = {/* statistic link groups */ - .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_stats_list.lock), - .list = LIST_HEAD_INIT(smc_lgr_stats_list.list), -}; int smc_stats_init(struct net *net) { @@ -48,46 +42,6 @@ void smc_stats_exit(struct net *net) free_percpu(net->smc.smc_stats); } -int smc_lgr_link_stats_init(struct smc_link_group *lgr) -{ - struct list_head *lgr_stats_list = &smc_lgr_stats_list.list; - spinlock_t *lgr_stats_lock = &smc_lgr_stats_list.lock; - int i, j; - - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lgr->lnk_stats[i].ib_stats = - alloc_percpu(struct smc_link_ib_stats); - if (!lgr->lnk_stats[i].ib_stats) - goto err; - } - spin_lock_bh(lgr_stats_lock); - 
list_add_tail(&lgr->stats_list, lgr_stats_list); - spin_unlock_bh(lgr_stats_lock); - return 0; - -err: - for (j = i - 1; j >= 0; j--) { - free_percpu(lgr->lnk_stats[j].ib_stats); - lgr->lnk_stats[j].ib_stats = NULL; - } - return -ENOMEM; -} - -void smc_lgr_link_stats_free(struct smc_link_group *lgr) -{ - spinlock_t *lgr_stats_lock = &smc_lgr_stats_list.lock; - int i; - - spin_lock_bh(lgr_stats_lock); - list_del_init(&lgr->stats_list); - spin_unlock_bh(lgr_stats_lock); - - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - free_percpu(lgr->lnk_stats[i].ib_stats); - lgr->lnk_stats[i].ib_stats = NULL; - } -} - static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index 311de65b6fce..84b7ecd8c05c 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -88,51 +88,6 @@ struct smc_stats { u64 srv_hshake_err_cnt; }; -struct smc_link_ib_stats { - u64 s_wr_cnt; - u64 s_wc_cnt; - u64 r_wr_cnt; - u64 r_wc_cnt; - u64 rw_wc_cnt; -}; - -struct smc_link_stats { - struct smc_link_ib_stats __percpu *ib_stats; - u32 qpn; - u32 peer_qpn; -}; - -#define SMC_LINK_STAT_IB(_lnk_stats, op, elem) \ - this_cpu_inc((_lnk_stats)->ib_stats->op ## _ ## elem ## _cnt) - -#define SMC_LINK_STAT_WC(lnk_stats, opcode, is_rx) \ -do { \ - typeof(lnk_stats) _lnk_stats = lnk_stats; \ - typeof(opcode) op = opcode; \ - if (is_rx) { \ - SMC_LINK_STAT_IB(_lnk_stats, r, wc); \ - } else { \ - if (op == IB_WC_SEND || op == IB_WC_REG_MR) \ - SMC_LINK_STAT_IB(_lnk_stats, s, wc); \ - if (op == IB_WC_RDMA_WRITE) \ - SMC_LINK_STAT_IB(_lnk_stats, rw, wc); \ - } \ -} \ -while (0) - -#define SMC_LINK_STAT_WR(lnk_stats, opcode, is_rx) \ -do { \ - typeof(lnk_stats) _lnk_stats = lnk_stats; \ - typeof(opcode) op = opcode; \ - if (is_rx) { \ - SMC_LINK_STAT_IB(_lnk_stats, r, wr); \ - } else { \ - if (op == IB_WR_SEND || op == IB_WR_REG_MR) \ - SMC_LINK_STAT_IB(_lnk_stats, s, wr); \ - } \ -} \ -while (0) - #define 
SMC_STAT_PAYLOAD_SUB(_smc_stats, _tech, key, _len, _rc) \ do { \ typeof(_smc_stats) stats = (_smc_stats); \ @@ -305,8 +260,6 @@ while (0) int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb); -int smc_lgr_link_stats_init(struct smc_link_group *lgr); -void smc_lgr_link_stats_free(struct smc_link_group *lgr); int smc_stats_init(struct net *net); void smc_stats_exit(struct net *net); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index d6324144cdf3..6ed67835c687 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -63,31 +63,6 @@ static struct ctl_table smc_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, - { - .procname = "tcp2smc", - .data = &init_net.smc.sysctl_tcp2smc, - .maxlen = sizeof(init_net.smc.sysctl_tcp2smc), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "allow_different_subnet", - .data = &init_net.smc.sysctl_allow_different_subnet, - .maxlen = sizeof(init_net.smc.sysctl_allow_different_subnet), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "limit_handshake", - .data = &init_net.smc.limit_smc_hs, - .maxlen = sizeof(init_net.smc.limit_smc_hs), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, { } }; @@ -114,10 +89,8 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; - net->smc.sysctl_wmem = 262144; /* 256 KiB */ - net->smc.sysctl_rmem = 262144; /* 256 KiB */ - net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_allow_different_subnet = 1; + WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(net->smc.sysctl_rmem, 
READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); return 0; err_reg: diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 1c0c2a411e97..64dedffe9d26 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -347,12 +347,6 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; - /* rtoken might be deleted if peer freed connection */ - if (!rdma_wr->rkey || - (rdma_wr->remote_addr == (conn->tx_off + peer_rmbe_offset))) { - pr_warn_ratelimited("smc: unexpected sends during connection termination flow\n"); - return -EINVAL; - } rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 0eff796ca5a9..26f8f240d9e8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -30,7 +30,6 @@ #include "smc.h" #include "smc_wr.h" -#include "smc_dim.h" #define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. 
queue elements in 1 poll */ @@ -134,6 +133,39 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) wake_up(&link->wr_tx_wait); } +static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) +{ + struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); + struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int i = 0, rc; + int polled = 0; + +again: + polled++; + do { + memset(&wc, 0, sizeof(wc)); + rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_send, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; + for (i = 0; i < rc; i++) + smc_wr_tx_process_cqe(&wc[i]); + } while (rc > 0); + if (polled == 1) + goto again; +} + +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->send_tasklet); +} + /*---------------------------- request submission ---------------------------*/ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) @@ -274,14 +306,13 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); smcr_link_down_cond_sched(link); - } else { - SMC_LINK_STAT_WR(&link->lgr->lnk_stats[link->link_idx], - link->wr_tx_ibs[pend->idx].opcode, 0); } return rc; } @@ -292,13 +323,12 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); 
smcr_link_down_cond_sched(link); - } else { - SMC_LINK_STAT_WR(&link->lgr->lnk_stats[link->link_idx], - link->wr_tx_v2_ib->opcode, 0); } return rc; } @@ -337,6 +367,8 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; link->wr_reg.mr = mr; @@ -344,9 +376,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL); if (rc) return rc; - else - SMC_LINK_STAT_WR(&link->lgr->lnk_stats[link->link_idx], - link->wr_reg.wr.opcode, 0); atomic_inc(&link->wr_reg_refcnt); rc = wait_event_interruptible_timeout(link->wr_reg_wait, @@ -402,8 +431,6 @@ int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) { struct smc_link *link = (struct smc_link *)wc->qp->qp_context; - int rx_buf_size = (link->lgr->smc_version == SMC_V2) ? 
- SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE; struct smc_wr_rx_handler *handler; struct smc_wr_rx_hdr *wr_rx; u64 temp_wr_id; @@ -411,84 +438,72 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) if (wc->byte_len < sizeof(*wr_rx)) return; /* short message */ - temp_wr_id = wc->wr_id / 2; + temp_wr_id = wc->wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); - wr_rx = (struct smc_wr_rx_hdr *)((u8 *)link->wr_rx_bufs + index * rx_buf_size); + wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { if (handler->type == wr_rx->type) handler->handler(wc, wr_rx); } } -static inline void smc_wr_rx_process_cqe(struct ib_wc *wc) +static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) { - struct smc_link *link = wc->qp->qp_context; + struct smc_link *link; + int i; - if (wc->status == IB_WC_SUCCESS) { - link->wr_rx_tstamp = jiffies; - smc_wr_rx_demultiplex(wc); - smc_wr_rx_post(link); /* refill WR RX */ - } else { - /* handle status errors */ - switch (wc->status) { - case IB_WC_RETRY_EXC_ERR: - case IB_WC_RNR_RETRY_EXC_ERR: - case IB_WC_WR_FLUSH_ERR: - smcr_link_down_cond_sched(link); - break; - default: + for (i = 0; i < num; i++) { + link = wc[i].qp->qp_context; + if (wc[i].status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; + smc_wr_rx_demultiplex(&wc[i]); smc_wr_rx_post(link); /* refill WR RX */ - break; + } else { + /* handle status errors */ + switch (wc[i].status) { + case IB_WC_RETRY_EXC_ERR: + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + smcr_link_down_cond_sched(link); + break; + default: + smc_wr_rx_post(link); /* refill WR RX */ + break; + } } } } -static void smc_wr_tasklet_fn(struct tasklet_struct *t) +static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); + struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - struct smc_link_stats 
*lnk_stats; - int i, rc, completed = 0; - struct smc_link *link; + int polled = 0; + int rc; again: + polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); - for (i = 0; i < rc; i++) { - link = wc[i].qp->qp_context; - lnk_stats = &link->lgr->lnk_stats[link->link_idx]; - if (smc_wr_id_is_rx(wc[i].wr_id)) { - SMC_LINK_STAT_WC(lnk_stats, wc[i].opcode, 1); - smc_wr_rx_process_cqe(&wc[i]); - } else { - SMC_LINK_STAT_WC(lnk_stats, wc[i].opcode, 0); - smc_wr_tx_process_cqe(&wc[i]); - } + rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_recv, + IB_CQ_SOLICITED_MASK + | IB_CQ_REPORT_MISSED_EVENTS); } - - if (rc > 0) - completed += rc; + if (!rc) + break; + smc_wr_rx_process_cqes(&wc[0], rc); } while (rc > 0); - - /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, - * then it is safe to wait for the next event; else we must poll the - * CQ again to make sure we won't miss any event. 
- */ - if (ib_req_notify_cq(smcibcq->ib_cq, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS) > 0) + if (polled == 1) goto again; - - if (smcibcq->ib_cq->dim) - smc_dim(smcibcq->ib_cq->dim, completed); } -void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context) +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; - tasklet_schedule(&smcibcq->tasklet); + tasklet_schedule(&dev->recv_tasklet); } int smc_wr_rx_post_init(struct smc_link *link) @@ -505,8 +520,6 @@ int smc_wr_rx_post_init(struct smc_link *link) void smc_wr_remember_qp_attr(struct smc_link *lnk) { - struct smc_link_stats *lnk_stats = - &lnk->lgr->lnk_stats[lnk->link_idx]; struct ib_qp_attr *attr = &lnk->qp_attr; struct ib_qp_init_attr init_attr; @@ -536,11 +549,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->qp_attr.cap.max_send_wr); lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, lnk->qp_attr.cap.max_recv_wr); - lnk_stats->qpn = lnk->roce_qp->qp_num; } static void smc_wr_init_sge(struct smc_link *lnk) { + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; @@ -561,7 +574,8 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; lnk->wr_tx_ibs[i].num_sge = 1; lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; - lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED; + lnk->wr_tx_ibs[i].send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; if (send_inline) lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; @@ -581,7 +595,8 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; lnk->wr_tx_v2_ib->num_sge = 1; lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; - lnk->wr_tx_v2_ib->send_flags = IB_SEND_SIGNALED; + lnk->wr_tx_v2_ib->send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; } /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. @@ -589,27 +604,25 @@ static void smc_wr_init_sge(struct smc_link *lnk) * and the same buffer for all sges. When a larger message arrived then * the content of the first small sge is copied to the beginning of * the larger spillover buffer, allowing easy data mapping. - * - * Noticed that the following is a temporary workaround in eRDMA - * situation: - * wr_rx_bufs is alloced as SMC_WR_BUF_SIZE with SMC-Rv1, and as - * SMC_WR_BUF_V2_SIZE with SMC-Rv2. This will bring extra memory - * consumption in SMC-Rv2 compared to upstream design, and will - * be revert once eRDMA supports max_recv_sge larger than 1. */ for (i = 0; i < lnk->wr_rx_cnt; i++) { - int rx_msg_size = (lnk->lgr->smc_version == SMC_V2) ? - SMC_WR_BUF_V2_SIZE : SMC_WR_TX_SIZE; - int rx_buf_size = (lnk->lgr->smc_version == SMC_V2) ? 
- SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE; - - lnk->wr_rx_sges[i].addr = - lnk->wr_rx_dma_addr + i * rx_buf_size; - lnk->wr_rx_sges[i].length = rx_msg_size; - lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + int x = i * sges_per_buf; + + lnk->wr_rx_sges[x].addr = + lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; + lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_sges[x + 1].addr = + lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].length = + SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].lkey = + lnk->roce_pd->local_dma_lkey; + } lnk->wr_rx_ibs[i].next = NULL; - lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i]; - lnk->wr_rx_ibs[i].num_sge = 1; + lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; + lnk->wr_rx_ibs[i].num_sge = sges_per_buf; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -639,6 +652,12 @@ void smc_wr_free_link(struct smc_link *lnk) DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } if (lnk->wr_tx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, @@ -658,6 +677,8 @@ void smc_wr_free_lgr_mem(struct smc_link_group *lgr) if (lgr->smc_version < SMC_V2) return; + kfree(lgr->wr_rx_buf_v2); + lgr->wr_rx_buf_v2 = NULL; kfree(lgr->wr_tx_buf_v2); lgr->wr_tx_buf_v2 = NULL; } @@ -699,22 +720,26 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) if (lgr->smc_version < SMC_V2) return 0; + lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if (!lgr->wr_rx_buf_v2) + return -ENOMEM; lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); - if (!lgr->wr_tx_buf_v2) + if (!lgr->wr_tx_buf_v2) { + kfree(lgr->wr_rx_buf_v2); return -ENOMEM; + } return 0; } int smc_wr_alloc_link_mem(struct smc_link *link) { - 
int rx_buf_size = (link->lgr->smc_version == SMC_V2) ? - SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE; + int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; /* allocate link related memory */ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, rx_buf_size, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -741,7 +766,8 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, sizeof(link->wr_rx_sges[0]), + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; @@ -807,33 +833,25 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq; i++) - tasklet_kill(&smcibdev->smcibcq[i].tasklet); + tasklet_kill(&smcibdev->recv_tasklet); + tasklet_kill(&smcibdev->send_tasklet); } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - int i; - - for (i = 0; i < smcibdev->num_cq; i++) { - tasklet_setup(&smcibdev->smcibcq[i].tasklet, - smc_wr_tasklet_fn); - } + tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) { - int rx_buf_size = (lnk->lgr->smc_version == SMC_V2) ? 
- SMC_WR_BUF_V2_SIZE : SMC_WR_BUF_SIZE; struct ib_device *ibdev = lnk->smcibdev->ibdev; int rc = 0; smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); - lnk->wr_rx_id = 1; + lnk->wr_rx_id = 0; lnk->wr_rx_dma_addr = ib_dma_map_single( - ibdev, lnk->wr_rx_bufs, rx_buf_size * lnk->wr_rx_cnt, + ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { lnk->wr_rx_dma_addr = 0; @@ -841,6 +859,14 @@ int smc_wr_create_link(struct smc_link *lnk) goto out; } if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, DMA_TO_DEVICE); @@ -866,6 +892,12 @@ int smc_wr_create_link(struct smc_link *lnk) return rc; dma_unmap: + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } if (lnk->wr_tx_v2_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, SMC_WR_BUF_V2_SIZE, diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index fca395701be3..a54e90a1110f 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -51,7 +51,7 @@ struct smc_wr_rx_handler { */ static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link) { - return atomic_long_add_return(2, &link->wr_tx_id); + return atomic_long_inc_return(&link->wr_tx_id); } static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) @@ -86,28 +86,18 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { - struct smc_link_stats *lnk_stats = - &link->lgr->lnk_stats[link->link_idx]; int rc; 
u64 wr_id, temp_wr_id; u32 index; - link->wr_rx_id += 2; - wr_id = link->wr_rx_id; /* tasklet context, thus not atomic */ - temp_wr_id = wr_id / 2; + wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */ + temp_wr_id = wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); - if (!rc) - SMC_LINK_STAT_WR(lnk_stats, 0, 1); return rc; } -static inline bool smc_wr_id_is_rx(u64 wr_id) -{ - return wr_id % 2; -} - int smc_wr_create_link(struct smc_link *lnk); int smc_wr_alloc_link_mem(struct smc_link *lnk); int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr); @@ -134,11 +124,12 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int len); int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); -void smc_wr_cq_handler(struct ib_cq *ib_cq, void *cq_context); +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr); #endif /* SMC_WR_H */ diff --git a/net/socket.c b/net/socket.c index 96860a0f9330..d52c265ad449 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1367,14 +1367,6 @@ int __sock_create(struct net *net, int family, int type, int protocol, current->comm); family = PF_PACKET; } -#if IS_ENABLED(CONFIG_SMC) - if (!kern && (family == AF_INET || family == AF_INET6) && - type == SOCK_STREAM && (protocol == IPPROTO_IP || - protocol == IPPROTO_TCP) && net->smc.sysctl_tcp2smc) { - protocol = (family == AF_INET) ? 
SMCPROTO_SMC : SMCPROTO_SMC6; - family = AF_SMC; - } -#endif err = security_socket_create(family, type, protocol, kern); if (err) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_smc.c b/tools/testing/selftests/bpf/prog_tests/bpf_smc.c deleted file mode 100644 index b57326c26544..000000000000 --- a/tools/testing/selftests/bpf/prog_tests/bpf_smc.c +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include "bpf_smc.skel.h" - -void test_bpf_smc(void) -{ - struct bpf_smc *smc_skel; - struct bpf_link *link; - int err; - - smc_skel = bpf_smc__open(); - if (!ASSERT_OK_PTR(smc_skel, "skel_open")) - return; - - err = bpf_map__set_type(smc_skel->maps.negotiator_map, BPF_MAP_TYPE_HASH); - if (!ASSERT_OK(err, "bpf_map__set_type")) - goto error; - - err = bpf_map__set_max_entries(smc_skel->maps.negotiator_map, 1); - if (!ASSERT_OK(err, "bpf_map__set_type")) - goto error; - - err = bpf_smc__load(smc_skel); - if (!ASSERT_OK(err, "skel_load")) - goto error; - - link = bpf_map__attach_struct_ops(smc_skel->maps.ops); - if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) - goto error; - - bpf_link__destroy(link); -error: - bpf_smc__destroy(smc_skel); -} diff --git a/tools/testing/selftests/bpf/progs/bpf_smc.c b/tools/testing/selftests/bpf/progs/bpf_smc.c deleted file mode 100644 index 4f4a5416c3c9..000000000000 --- a/tools/testing/selftests/bpf/progs/bpf_smc.c +++ /dev/null @@ -1,320 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include -#include -#include -#include -#include -#include -#include -#include - -#define AF_SMC (43) -#define SMC_LISTEN (10) -#define SMC_SOCK_CLOSED_TIMING (0) -extern unsigned long CONFIG_HZ __kconfig; -#define HZ CONFIG_HZ - -char _license[] SEC("license") = "GPL"; -#define max(a, b) ((a) > (b) ? 
(a) : (b)) - -struct sock_common { - unsigned char skc_state; - unsigned short skc_family; - __u16 skc_num; -} __attribute__((preserve_access_index)); - -struct sock { - struct sock_common __sk_common; - int sk_sndbuf; -} __attribute__((preserve_access_index)); - -struct inet_sock { - struct sock sk; -} __attribute__((preserve_access_index)); - -struct inet_connection_sock { - struct inet_sock icsk_inet; -} __attribute__((preserve_access_index)); - -struct tcp_sock { - struct inet_connection_sock inet_conn; - __u32 rcv_nxt; - __u32 snd_nxt; - __u32 snd_una; - __u32 delivered; - __u8 syn_data:1, /* SYN includes data */ - syn_fastopen:1, /* SYN includes Fast Open option */ - syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ - syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ -} __attribute__((preserve_access_index)); - -struct socket { - struct sock *sk; -} __attribute__((preserve_access_index)); - -union smc_host_cursor { - struct { - __u16 reserved; - __u16 wrap; - __u32 count; - }; -} __attribute__((preserve_access_index)); - -struct smc_connection { - union smc_host_cursor tx_curs_sent; - union smc_host_cursor rx_curs_confirmed; -} __attribute__((preserve_access_index)); - -struct smc_sock { - struct sock sk; - struct socket *clcsock; /* internal tcp socket */ - struct smc_connection conn; - int use_fallback; -} __attribute__((preserve_access_index)); - -static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -static __always_inline struct smc_sock *smc_sk(struct sock *sk) -{ - return (struct smc_sock *)sk; -} - -struct smc_prediction { - /* protection for smc_prediction */ - struct bpf_spin_lock lock; - /* start of time slice */ - __u64 start_tstamp; - /* delta of pacing */ - __u64 pacing_delta; 
- /* N of closed connections determined as long connections - * in current time slice - */ - __u32 closed_long_cc; - /* N of closed connections in this time slice */ - __u32 closed_total_cc; - /* N of incoming connections determined as long connections - * in current time slice - */ - __u32 incoming_long_cc; - /* last splice rate of long cc */ - __u32 last_rate_of_lcc; -}; - -#define SMC_PREDICTION_MIN_PACING_DELTA (1llu) -#define SMC_PREDICTION_MAX_PACING_DELTA (HZ << 3) -#define SMC_PREDICTION_MAX_LONGCC_PER_SPLICE (8) -#define SMC_PREDICTION_MAX_PORT (64) -#define SMC_PREDICTION_MAX_SPLICE_GAP (1) -#define SMC_PREDICTION_LONGCC_RATE_THRESHOLD (13189) -#define SMC_PREDICTION_LONGCC_PACKETS_THRESHOLD (100) -#define SMC_PREDICTION_LONGCC_BYTES_THRESHOLD \ - (SMC_PREDICTION_LONGCC_PACKETS_THRESHOLD * 1024) - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, SMC_PREDICTION_MAX_PORT); - __type(key, __u16); - __type(value, struct smc_prediction); -} negotiator_map SEC(".maps"); - - -static inline __u32 smc_prediction_calt_rate(struct smc_prediction *smc_predictor) -{ - if (!smc_predictor->closed_total_cc) - return smc_predictor->last_rate_of_lcc; - - return (smc_predictor->closed_long_cc << 14) / smc_predictor->closed_total_cc; -} - -static inline struct smc_prediction *smc_prediction_get(__u16 key, __u64 tstamp) -{ - struct smc_prediction zero = {}, *smc_predictor; - __u32 gap; - int err; - - smc_predictor = bpf_map_lookup_elem(&negotiator_map, &key); - if (!smc_predictor) { - zero.start_tstamp = bpf_jiffies64(); - zero.pacing_delta = SMC_PREDICTION_MIN_PACING_DELTA; - err = bpf_map_update_elem(&negotiator_map, &key, &zero, 0); - if (err) - return NULL; - smc_predictor = bpf_map_lookup_elem(&negotiator_map, &key); - if (!smc_predictor) - return NULL; - } - - if (tstamp) { - bpf_spin_lock(&smc_predictor->lock); - gap = (tstamp - smc_predictor->start_tstamp) / smc_predictor->pacing_delta; - /* new splice */ - if (gap > 0) { - smc_predictor->start_tstamp 
= tstamp; - smc_predictor->last_rate_of_lcc = - (smc_prediction_calt_rate(smc_predictor) * 7) >> (2 + gap); - smc_predictor->closed_long_cc = 0; - smc_predictor->closed_total_cc = 0; - smc_predictor->incoming_long_cc = 0; - } - bpf_spin_unlock(&smc_predictor->lock); - } - return smc_predictor; -} - -/* BPF struct ops for smc protocol negotiator */ -struct smc_sock_negotiator_ops { - /* ret for negotiate */ - int (*negotiate)(struct smc_sock *smc); - - /* info gathering timing */ - void (*collect_info)(struct smc_sock *smc, int timing); -}; - -int SEC("struct_ops/bpf_smc_negotiate") -BPF_PROG(bpf_smc_negotiate, struct smc_sock *smc) -{ - struct smc_prediction *smc_predictor; - int err, ret = SK_DROP; - struct tcp_sock *tp; - struct sock *clcsk; - __u32 rate = 0; - __u16 key; - - /* client side */ - if (smc == NULL || smc->sk.__sk_common.skc_state != SMC_LISTEN) { - /* use Global smc_predictor */ - key = 0; - } else { /* server side */ - clcsk = BPF_CORE_READ(smc, clcsock, sk); - if (!clcsk) - goto error; - tp = tcp_sk(clcsk); - err = bpf_core_read(&key, sizeof(__u16), - &tp->inet_conn.icsk_inet.sk.__sk_common.skc_num); - if (err) - goto error; - } - - smc_predictor = smc_prediction_get(key, bpf_jiffies64()); - if (!smc_predictor) - return SK_PASS; - - bpf_spin_lock(&smc_predictor->lock); - - if (smc_predictor->incoming_long_cc == 0) - goto out_locked_pass; - - if (smc_predictor->incoming_long_cc > SMC_PREDICTION_MAX_LONGCC_PER_SPLICE) - goto out_locked_drop; - - rate = smc_prediction_calt_rate(smc_predictor); - if (rate < SMC_PREDICTION_LONGCC_RATE_THRESHOLD) - goto out_locked_drop; - -out_locked_pass: - smc_predictor->incoming_long_cc++; - bpf_spin_unlock(&smc_predictor->lock); - return SK_PASS; -out_locked_drop: - bpf_spin_unlock(&smc_predictor->lock); -error: - return SK_DROP; -} - -void SEC("struct_ops/bpf_smc_collect_info") -BPF_PROG(bpf_smc_collect_info, struct sock *sk, int timing) -{ - struct smc_prediction *smc_predictor; - int use_fallback, sndbuf, err; - 
struct smc_sock *smc; - struct tcp_sock *tp; - struct sock *clcsk; - bool match = false; - __u16 wrap, count; - __u32 delivered; - __u16 key; - - /* no info can collect */ - if (sk == NULL) - return; - - /* only fouces on closed */ - if (timing != SMC_SOCK_CLOSED_TIMING) - return; - - /* first check the sk type */ - if (sk->__sk_common.skc_family == AF_SMC) { - smc = smc_sk(sk); - clcsk = BPF_CORE_READ(smc, clcsock, sk); - if (!clcsk) - goto error; - tp = tcp_sk(clcsk); - /* check if it's fallback */ - err = bpf_core_read(&use_fallback, sizeof(use_fallback), &smc->use_fallback); - if (err) - goto error; - if (use_fallback) - goto fallback; - err = bpf_core_read(&wrap, sizeof(__u16), &smc->conn.tx_curs_sent.wrap); - if (err) - goto error; - err = bpf_core_read(&count, sizeof(__u16), &smc->conn.tx_curs_sent.count); - if (err) - goto error; - err = bpf_core_read(&sndbuf, sizeof(int), &clcsk->sk_sndbuf); - if (err) - goto error; - match = (count + wrap * sndbuf) > SMC_PREDICTION_LONGCC_BYTES_THRESHOLD; - } else { - smc = NULL; - tp = tcp_sk(sk); - use_fallback = 1; -fallback: - err = bpf_core_read(&delivered, sizeof(delivered), &tp->delivered); - if (err) - goto error; - match = (delivered > SMC_PREDICTION_LONGCC_PACKETS_THRESHOLD); - } - - /* whatever, tp is never NULL */ - err = bpf_core_read(&key, sizeof(__u16), &tp->inet_conn.icsk_inet.sk.__sk_common.skc_num); - if (err) - goto error; - - smc_predictor = smc_prediction_get(key, 0); - if (!smc_predictor) - goto error; - - bpf_spin_lock(&smc_predictor->lock); - smc_predictor->closed_total_cc++; - if (match) { - /* increase stats */ - smc_predictor->closed_long_cc++; - /* try more aggressive */ - if (smc_predictor->pacing_delta > SMC_PREDICTION_MIN_PACING_DELTA) { - if (use_fallback) { - smc_predictor->pacing_delta = max(SMC_PREDICTION_MIN_PACING_DELTA, - (smc_predictor->pacing_delta * 3) >> 2); - } - } - } else if (!use_fallback) { - smc_predictor->pacing_delta <<= 1; - } - bpf_spin_unlock(&smc_predictor->lock); 
-error: - return; -} - -SEC(".struct_ops") -struct smc_sock_negotiator_ops ops = { - .negotiate = (void *)bpf_smc_negotiate, - .collect_info = (void *)bpf_smc_collect_info, -}; -- Gitee From 49db7e45343739cdd6e08a8a0490e662d0c2851e Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jan 2023 19:17:45 +0100 Subject: [PATCH 02/21] net/smc: Terminate connections prior to device removal ANBZ: #5534 commit c40bff4132e5c1320635ae809d001ccb5598dac6 upstream. Removing an ISM device prior to terminating its associated connections doesn't end well. Signed-off-by: Stefan Raspl Signed-off-by: Jan Karcher Signed-off-by: Wenjia Zhang Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- net/smc/smc_ism.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 98de2bc61483..71e38768fea2 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -461,11 +461,11 @@ void smcd_unregister_dev(struct smcd_dev *smcd) { pr_warn_ratelimited("smc: removing smcd device %s\n", dev_name(&smcd->dev)); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); destroy_workqueue(smcd->event_wq); device_del(&smcd->dev); -- Gitee From 7a0f7b203c34c104d0882dc9be04f6a48ab88a32 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jan 2023 19:17:46 +0100 Subject: [PATCH 03/21] net/ism: Add missing calls to disable bus-mastering ANBZ: #5534 commit 462502ff9acb7bb02405e3e486428472db7c48dc upstream. Signed-off-by: Stefan Raspl Signed-off-by: Jan Karcher Signed-off-by: Wenjia Zhang Signed-off-by: David S. 
Miller Signed-off-by: Wen Gu --- drivers/s390/net/ism_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 1adb00ca0a0a..6d8be79c9229 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -581,6 +581,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) err_free: smcd_free_dev(ism->smcd); err_resource: + pci_clear_master(pdev); pci_release_mem_regions(pdev); err_disable: pci_disable_device(pdev); @@ -611,6 +612,7 @@ static void ism_remove(struct pci_dev *pdev) ism_dev_exit(ism); smcd_free_dev(ism->smcd); + pci_clear_master(pdev); pci_release_mem_regions(pdev); pci_disable_device(pdev); dev_set_drvdata(&pdev->dev, NULL); -- Gitee From 6cebdf8e4fb1ef960bd5eaa8787003860128fae7 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jan 2023 19:17:47 +0100 Subject: [PATCH 04/21] s390/ism: Introduce struct ism_dmb ANBZ: #5534 commit 1baedb13f1d50ae8c7852134fdf934b4463e9baa upstream. Conceptually, a DMB is a structure that belongs to ISM devices. However, SMC currently 'owns' this structure. So future exploiters of ISM devices would be forced to include SMC headers to work - which is just weird. Therefore, we switch ISM to struct ism_dmb, introduce a new public header with the definition (will be populated with further API calls later on), and, add a thin wrapper to please SMC. Since structs smcd_dmb and ism_dmb are identical, we can simply convert between the two for now. Signed-off-by: Stefan Raspl Signed-off-by: Jan Karcher Signed-off-by: Wenjia Zhang Signed-off-by: David S. 
Miller Signed-off-by: Wen Gu --- drivers/s390/net/ism.h | 1 + drivers/s390/net/ism_drv.c | 22 ++++++++++++++++------ include/linux/ism.h | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 include/linux/ism.h diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 38fe90c2597d..90af51370183 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 6d8be79c9229..abd09200bee6 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -215,14 +215,14 @@ static int ism_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, return ism_cmd(ism, &cmd); } -static void ism_free_dmb(struct ism_dev *ism, struct smcd_dmb *dmb) +static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, dmb->cpu_addr, dmb->dma_addr); } -static int ism_alloc_dmb(struct ism_dev *ism, struct smcd_dmb *dmb) +static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { unsigned long bit; @@ -250,7 +250,7 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct smcd_dmb *dmb) return dmb->cpu_addr ? 
0 : -ENOMEM; } -static int ism_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +static int ism_register_dmb(struct smcd_dev *smcd, struct ism_dmb *dmb) { struct ism_dev *ism = smcd->priv; union ism_reg_dmb cmd; @@ -281,7 +281,12 @@ static int ism_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) return ret; } -static int ism_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + return ism_register_dmb(smcd, (struct ism_dmb *)dmb); +} + +static int ism_unregister_dmb(struct smcd_dev *smcd, struct ism_dmb *dmb) { struct ism_dev *ism = smcd->priv; union ism_unreg_dmb cmd; @@ -302,6 +307,11 @@ static int ism_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) return ret; } +static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + return ism_unregister_dmb(smcd, (struct ism_dmb *)dmb); +} + static int ism_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) { struct ism_dev *ism = smcd->priv; @@ -474,8 +484,8 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct smcd_ops ism_ops = { .query_remote_gid = ism_query_rgid, - .register_dmb = ism_register_dmb, - .unregister_dmb = ism_unregister_dmb, + .register_dmb = smcd_register_dmb, + .unregister_dmb = smcd_unregister_dmb, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, .set_vlan_required = ism_set_vlan_required, diff --git a/include/linux/ism.h b/include/linux/ism.h new file mode 100644 index 000000000000..69bfbf0faaa1 --- /dev/null +++ b/include/linux/ism.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Internal Shared Memory + * + * Definitions for the ISM module + * + * Copyright IBM Corp. 
2022 + */ +#ifndef _ISM_H +#define _ISM_H + +struct ism_dmb { + u64 dmb_tok; + u64 rgid; + u32 dmb_len; + u32 sba_idx; + u32 vlan_valid; + u32 vlan_id; + void *cpu_addr; + dma_addr_t dma_addr; +}; + +#endif /* _ISM_H */ -- Gitee From 66ecb8368721f9f7b8b1f2d3fec18286710d52c6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jan 2023 19:17:48 +0100 Subject: [PATCH 05/21] net/ism: Add new API for client registration ANBZ: #5534 commit 89e7d2ba61b742a7525ff06ea4d4378c4a5560d0 upstream. Add a new API that allows other drivers to concurrently access ISM devices. To do so, we introduce a new API that allows other modules to register for ISM device usage. Furthermore, we move the GID to struct ism, where it belongs conceptually, and rename and relocate struct smcd_event to struct ism_event. This is the first part of a bigger overhaul of the interfaces between SMC and ISM. Signed-off-by: Stefan Raspl Signed-off-by: Jan Karcher Signed-off-by: Wenjia Zhang Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- drivers/s390/net/ism.h | 18 +--- drivers/s390/net/ism_drv.c | 172 +++++++++++++++++++++++++++++++++++-- include/linux/ism.h | 67 +++++++++++++++ include/net/smc.h | 18 ++-- net/smc/smc_ism.c | 4 +- 5 files changed, 243 insertions(+), 36 deletions(-) diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 90af51370183..70c5bbda0fea 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -16,7 +16,6 @@ */ #define ISM_DMB_WORD_OFFSET 1 #define ISM_DMB_BIT_OFFSET (ISM_DMB_WORD_OFFSET * 32) -#define ISM_NR_DMBS 1920 #define ISM_IDENT_MASK 0x00FFFF #define ISM_REG_SBA 0x1 @@ -178,7 +177,7 @@ struct ism_eq_header { struct ism_eq { struct ism_eq_header header; - struct smcd_event entry[15]; + struct ism_event entry[15]; }; struct ism_sba { @@ -190,21 +189,6 @@ struct ism_sba { u16 dmbe_mask[ISM_NR_DMBS]; }; -struct ism_dev { - spinlock_t lock; - struct pci_dev *pdev; - struct smcd_dev *smcd; - - struct ism_sba *sba; - dma_addr_t sba_dma_addr; - 
DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); - - struct ism_eq *ieq; - dma_addr_t ieq_dma_addr; - - int ieq_idx; -}; - #define ISM_CREATE_REQ(dmb, idx, sf, offset) \ ((dmb) | (idx) << 24 | (sf) << 23 | (offset)) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index abd09200bee6..0b6896dd6073 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -15,9 +15,6 @@ #include #include #include -#include - -#include #include "ism.h" @@ -34,6 +31,84 @@ static const struct pci_device_id ism_device_table[] = { MODULE_DEVICE_TABLE(pci, ism_device_table); static debug_info_t *ism_debug_info; +static const struct smcd_ops ism_ops; + +#define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */ +static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ + /* a list for fast mapping */ +static u8 max_client; +static DEFINE_SPINLOCK(clients_lock); +struct ism_dev_list { + struct list_head list; + struct mutex mutex; /* protects ism device list */ +}; + +static struct ism_dev_list ism_dev_list = { + .list = LIST_HEAD_INIT(ism_dev_list.list), + .mutex = __MUTEX_INITIALIZER(ism_dev_list.mutex), +}; + +int ism_register_client(struct ism_client *client) +{ + struct ism_dev *ism; + unsigned long flags; + int i, rc = -ENOSPC; + + mutex_lock(&ism_dev_list.mutex); + spin_lock_irqsave(&clients_lock, flags); + for (i = 0; i < MAX_CLIENTS; ++i) { + if (!clients[i]) { + clients[i] = client; + client->id = i; + if (i == max_client) + max_client++; + rc = 0; + break; + } + } + spin_unlock_irqrestore(&clients_lock, flags); + if (i < MAX_CLIENTS) { + /* initialize with all devices that we got so far */ + list_for_each_entry(ism, &ism_dev_list.list, list) { + ism->priv[i] = NULL; + client->add(ism); + } + } + mutex_unlock(&ism_dev_list.mutex); + + return rc; +} +EXPORT_SYMBOL_GPL(ism_register_client); + +int ism_unregister_client(struct ism_client *client) +{ + struct ism_dev *ism; + unsigned long flags; + int rc = 0; + + 
mutex_lock(&ism_dev_list.mutex); + spin_lock_irqsave(&clients_lock, flags); + clients[client->id] = NULL; + if (client->id + 1 == max_client) + max_client--; + spin_unlock_irqrestore(&clients_lock, flags); + list_for_each_entry(ism, &ism_dev_list.list, list) { + for (int i = 0; i < ISM_NR_DMBS; ++i) { + if (ism->sba_client_arr[i] == client->id) { + pr_err("%s: attempt to unregister client '%s'" + "with registered dmb(s)\n", __func__, + client->name); + rc = -EBUSY; + goto out; + } + } + } +out: + mutex_unlock(&ism_dev_list.mutex); + + return rc; +} +EXPORT_SYMBOL_GPL(ism_unregister_client); static int ism_cmd(struct ism_dev *ism, void *cmd) { @@ -193,7 +268,7 @@ static int ism_read_local_gid(struct ism_dev *ism) if (ret) goto out; - ism->smcd->local_gid = cmd.response.gid; + ism->local_gid = cmd.response.gid; out: return ret; } @@ -436,7 +511,8 @@ static u16 ism_get_chid(struct smcd_dev *smcd) static void ism_handle_event(struct ism_dev *ism) { - struct smcd_event *entry; + struct ism_event *entry; + int i; while ((ism->ieq_idx + 1) != READ_ONCE(ism->ieq->header.idx)) { if (++(ism->ieq_idx) == ARRAY_SIZE(ism->ieq->entry)) @@ -444,13 +520,18 @@ static void ism_handle_event(struct ism_dev *ism) entry = &ism->ieq->entry[ism->ieq_idx]; debug_event(ism_debug_info, 2, entry, sizeof(*entry)); - smcd_handle_event(ism->smcd, entry); + spin_lock(&clients_lock); + for (i = 0; i < max_client; ++i) + if (clients[i]) + clients[i]->handle_event(ism, entry); + spin_unlock(&clients_lock); } } static irqreturn_t ism_handle_irq(int irq, void *data) { struct ism_dev *ism = data; + struct ism_client *clt; unsigned long bit, end; unsigned long *bv; u16 dmbemask; @@ -470,7 +551,8 @@ static irqreturn_t ism_handle_irq(int irq, void *data) dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET]; ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0; barrier(); - smcd_handle_irq(ism->smcd, bit + ISM_DMB_BIT_OFFSET, dmbemask); + clt = clients[ism->sba_client_arr[bit]]; + clt->handle_irq(ism, bit 
+ ISM_DMB_BIT_OFFSET, dmbemask); } if (ism->sba->e) { @@ -496,10 +578,21 @@ static const struct smcd_ops ism_ops = { .get_chid = ism_get_chid, }; +static void ism_dev_add_work_func(struct work_struct *work) +{ + struct ism_client *client = container_of(work, struct ism_client, + add_work); + + client->add(client->tgt_ism); + atomic_dec(&client->tgt_ism->add_dev_cnt); + wake_up(&client->tgt_ism->waitq); +} + static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - int ret; + unsigned long flags; + int i, ret; ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); if (ret <= 0) @@ -526,6 +619,28 @@ static int ism_dev_init(struct ism_dev *ism) /* hardware is V2 capable */ ism_create_system_eid(); + init_waitqueue_head(&ism->waitq); + atomic_set(&ism->free_clients_cnt, 0); + atomic_set(&ism->add_dev_cnt, 0); + + wait_event(ism->waitq, !atomic_read(&ism->add_dev_cnt)); + spin_lock_irqsave(&clients_lock, flags); + for (i = 0; i < max_client; ++i) + if (clients[i]) { + INIT_WORK(&clients[i]->add_work, + ism_dev_add_work_func); + clients[i]->tgt_ism = ism; + atomic_inc(&ism->add_dev_cnt); + schedule_work(&clients[i]->add_work); + } + spin_unlock_irqrestore(&clients_lock, flags); + + wait_event(ism->waitq, !atomic_read(&ism->add_dev_cnt)); + + mutex_lock(&ism_dev_list.mutex); + list_add(&ism->list, &ism_dev_list.list); + mutex_unlock(&ism_dev_list.mutex); + ret = smcd_register_dev(ism->smcd); if (ret) goto unreg_ieq; @@ -601,9 +716,36 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } +static void ism_dev_remove_work_func(struct work_struct *work) +{ + struct ism_client *client = container_of(work, struct ism_client, + remove_work); + + client->remove(client->tgt_ism); + atomic_dec(&client->tgt_ism->free_clients_cnt); + wake_up(&client->tgt_ism->waitq); +} + +/* Callers must hold ism_dev_list.mutex */ static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; + unsigned long flags; + 
int i; + + wait_event(ism->waitq, !atomic_read(&ism->free_clients_cnt)); + spin_lock_irqsave(&clients_lock, flags); + for (i = 0; i < max_client; ++i) + if (clients[i]) { + INIT_WORK(&clients[i]->remove_work, + ism_dev_remove_work_func); + clients[i]->tgt_ism = ism; + atomic_inc(&ism->free_clients_cnt); + schedule_work(&clients[i]->remove_work); + } + spin_unlock_irqrestore(&clients_lock, flags); + + wait_event(ism->waitq, !atomic_read(&ism->free_clients_cnt)); smcd_unregister_dev(ism->smcd); if (SYSTEM_EID.serial_number[0] != '0' || @@ -613,18 +755,22 @@ static void ism_dev_exit(struct ism_dev *ism) unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); pci_free_irq_vectors(pdev); + list_del_init(&ism->list); } static void ism_remove(struct pci_dev *pdev) { struct ism_dev *ism = dev_get_drvdata(&pdev->dev); + mutex_lock(&ism_dev_list.mutex); ism_dev_exit(ism); + mutex_unlock(&ism_dev_list.mutex); smcd_free_dev(ism->smcd); pci_clear_master(pdev); pci_release_mem_regions(pdev); pci_disable_device(pdev); + device_del(&ism->dev); dev_set_drvdata(&pdev->dev, NULL); kfree(ism); } @@ -644,6 +790,8 @@ static int __init ism_init(void) if (!ism_debug_info) return -ENODEV; + memset(clients, 0, sizeof(clients)); + max_client = 0; debug_register_view(ism_debug_info, &debug_hex_ascii_view); ret = pci_register_driver(&ism_driver); if (ret) @@ -654,6 +802,14 @@ static int __init ism_init(void) static void __exit ism_exit(void) { + struct ism_dev *ism; + + mutex_lock(&ism_dev_list.mutex); + list_for_each_entry(ism, &ism_dev_list.list, list) { + ism_dev_exit(ism); + } + mutex_unlock(&ism_dev_list.mutex); + pci_unregister_driver(&ism_driver); debug_unregister(ism_debug_info); } diff --git a/include/linux/ism.h b/include/linux/ism.h index 69bfbf0faaa1..55c8ad306928 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -9,6 +9,8 @@ #ifndef _ISM_H #define _ISM_H +#include + struct ism_dmb { u64 dmb_tok; u64 rgid; @@ -20,4 +22,69 @@ struct ism_dmb { dma_addr_t dma_addr; }; 
+/* Unless we gain unexpected popularity, this limit should hold for a while */ +#define MAX_CLIENTS 8 +#define ISM_NR_DMBS 1920 + +struct ism_dev { + spinlock_t lock; /* protects the ism device */ + struct list_head list; + struct pci_dev *pdev; + struct smcd_dev *smcd; + + struct ism_sba *sba; + dma_addr_t sba_dma_addr; + DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); + u8 *sba_client_arr; /* entries are indices into 'clients' array */ + void *priv[MAX_CLIENTS]; + + struct ism_eq *ieq; + dma_addr_t ieq_dma_addr; + + struct device dev; + u64 local_gid; + int ieq_idx; + + atomic_t free_clients_cnt; + atomic_t add_dev_cnt; + wait_queue_head_t waitq; +}; + +struct ism_event { + u32 type; + u32 code; + u64 tok; + u64 time; + u64 info; +}; + +struct ism_client { + const char *name; + void (*add)(struct ism_dev *dev); + void (*remove)(struct ism_dev *dev); + void (*handle_event)(struct ism_dev *dev, struct ism_event *event); + /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent + * via ism_move_data(). Callback function must handle all active bits + * indicated by dmbemask. + */ + void (*handle_irq)(struct ism_dev *dev, unsigned int bit, u16 dmbemask); + /* Private area - don't touch! 
*/ + struct work_struct remove_work; + struct work_struct add_work; + struct ism_dev *tgt_ism; + u8 id; +}; + +int ism_register_client(struct ism_client *client); +int ism_unregister_client(struct ism_client *client); +static inline void *ism_get_priv(struct ism_dev *dev, + struct ism_client *client) { + return dev->priv[client->id]; +} + +static inline void ism_set_priv(struct ism_dev *dev, struct ism_client *client, + void *priv) { + dev->priv[client->id] = priv; +} + #endif /* _ISM_H */ diff --git a/include/net/smc.h b/include/net/smc.h index 421a7197b475..98689b16b841 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -11,6 +11,14 @@ #ifndef _SMC_H #define _SMC_H +#include +#include +#include +#include +#include "linux/ism.h" + +struct sock; + #define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ struct smc_hashinfo { @@ -41,14 +49,6 @@ struct smcd_dmb { #define ISM_ERROR 0xFFFF -struct smcd_event { - u32 type; - u32 code; - u64 tok; - u64 time; - u64 info; -}; - struct smcd_dev; struct smcd_ops { @@ -93,6 +93,6 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, int smcd_register_dev(struct smcd_dev *smcd); void smcd_unregister_dev(struct smcd_dev *smcd); void smcd_free_dev(struct smcd_dev *smcd); -void smcd_handle_event(struct smcd_dev *dev, struct smcd_event *event); +void smcd_handle_event(struct smcd_dev *dev, struct ism_event *event); void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit, u16 dmbemask); #endif /* _SMC_H */ diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 71e38768fea2..39d3cf8cb221 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -295,7 +295,7 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct smcd_event event; + struct ism_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -489,7 +489,7 @@ EXPORT_SYMBOL_GPL(smcd_free_dev); * Context: * - Function called in IRQ context 
from ISM device driver event handler. */ -void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) +void smcd_handle_event(struct smcd_dev *smcd, struct ism_event *event) { struct smc_ism_event_work *wrk; -- Gitee From af89909e263fcff08fafc71defe07a96baa6951c Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jan 2023 19:17:49 +0100 Subject: [PATCH 06/21] net/smc: Register SMC-D as ISM client ANBZ: #5534 commit 8747716f3942a610efdd12e3655df47269c268ac upstream. Register the smc module with the new ism device driver API. This is the second part of a bigger overhaul of the interfaces between SMC and ISM. Signed-off-by: Stefan Raspl Signed-off-by: Jan Karcher Signed-off-by: Wenjia Zhang Signed-off-by: David S. Miller [fix the conflict to un-backport upstream patch] Signed-off-by: Wen Gu --- drivers/s390/net/ism_drv.c | 5 --- include/net/smc.h | 5 +-- net/smc/af_smc.c | 8 +++- net/smc/smc_core.c | 1 + net/smc/smc_ism.c | 82 +++++++++++++++++++++++++++----------- net/smc/smc_ism.h | 3 +- 6 files changed, 69 insertions(+), 35 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 0b6896dd6073..78ed4bdae11d 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -641,10 +641,6 @@ static int ism_dev_init(struct ism_dev *ism) list_add(&ism->list, &ism_dev_list.list); mutex_unlock(&ism_dev_list.mutex); - ret = smcd_register_dev(ism->smcd); - if (ret) - goto unreg_ieq; - query_info(ism); return 0; @@ -747,7 +743,6 @@ static void ism_dev_exit(struct ism_dev *ism) wait_event(ism->waitq, !atomic_read(&ism->free_clients_cnt)); - smcd_unregister_dev(ism->smcd); if (SYSTEM_EID.serial_number[0] != '0' || SYSTEM_EID.type[0] != '0') ism_del_vlan_id(ism->smcd, ISM_RESERVED_VLANID); diff --git a/include/net/smc.h b/include/net/smc.h index 98689b16b841..151aa54d9ad2 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -90,9 +90,6 @@ struct smcd_dev { struct smcd_dev *smcd_alloc_dev(struct device *parent, 
const char *name, const struct smcd_ops *ops, int max_dmbs); -int smcd_register_dev(struct smcd_dev *smcd); -void smcd_unregister_dev(struct smcd_dev *smcd); void smcd_free_dev(struct smcd_dev *smcd); -void smcd_handle_event(struct smcd_dev *dev, struct ism_event *event); -void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit, u16 dmbemask); + #endif /* _SMC_H */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 130070425568..c3079514bdb5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3398,12 +3398,14 @@ static int __init smc_init(void) if (rc) goto out_pernet_subsys; - smc_ism_init(); + rc = smc_ism_init(); + if (rc) + goto out_pernet_subsys_stat; smc_clc_init(); rc = smc_nl_init(); if (rc) - goto out_pernet_subsys_stat; + goto out_ism; rc = smc_pnet_init(); if (rc) @@ -3496,6 +3498,8 @@ static int __init smc_init(void) smc_pnet_exit(); out_nl: smc_nl_exit(); +out_ism: + smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8bc8a4f15a9c..5313fc9f3e47 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2610,6 +2610,7 @@ static int smc_core_reboot_event(struct notifier_block *this, { smc_lgrs_shutdown(); smc_ib_unregister_client(); + smc_ism_exit(); return 0; } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 39d3cf8cb221..611d4b90fadc 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -16,6 +16,7 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" +#include "linux/ism.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -25,6 +26,20 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +static void smcd_register_dev(struct ism_dev *ism); +static void smcd_unregister_dev(struct ism_dev *ism); +static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); +static void 
smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, + u16 dmbemask); + +static struct ism_client smc_ism_client = { + .name = "SMC-D", + .add = smcd_register_dev, + .remove = smcd_unregister_dev, + .handle_event = smcd_handle_event, + .handle_irq = smcd_handle_irq, +}; + /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { @@ -408,8 +423,6 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; - if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); @@ -420,9 +433,25 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, } EXPORT_SYMBOL_GPL(smcd_alloc_dev); -int smcd_register_dev(struct smcd_dev *smcd) +void smcd_free_dev(struct smcd_dev *smcd) { - int rc; + put_device(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_free_dev); + +static void smcd_register_dev(struct ism_dev *ism) +{ + const struct smcd_ops *ops = NULL; + struct smcd_dev *smcd; + + smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, + ISM_NR_DMBS); + if (!smcd) + return; + smcd->priv = ism; + ism_set_priv(ism, &smc_ism_client, smcd); + if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); mutex_lock(&smcd_dev_list.mutex); if (list_empty(&smcd_dev_list.list)) { @@ -446,19 +475,20 @@ int smcd_register_dev(struct smcd_dev *smcd) dev_name(&smcd->dev), smcd->pnetid, smcd->pnetid_by_user ? 
" (user defined)" : ""); - rc = device_add(&smcd->dev); - if (rc) { + if (device_add(&smcd->dev)) { mutex_lock(&smcd_dev_list.mutex); list_del(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); + smcd_free_dev(smcd); } - return rc; + return; } -EXPORT_SYMBOL_GPL(smcd_register_dev); -void smcd_unregister_dev(struct smcd_dev *smcd) +static void smcd_unregister_dev(struct ism_dev *ism) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + pr_warn_ratelimited("smc: removing smcd device %s\n", dev_name(&smcd->dev)); smcd->going_away = 1; @@ -470,16 +500,9 @@ void smcd_unregister_dev(struct smcd_dev *smcd) device_del(&smcd->dev); } -EXPORT_SYMBOL_GPL(smcd_unregister_dev); - -void smcd_free_dev(struct smcd_dev *smcd) -{ - put_device(&smcd->dev); -} -EXPORT_SYMBOL_GPL(smcd_free_dev); /* SMCD Device event handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer, + * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), * - event->code (event code), * - event->tok (either DMB token when event type 0, or GID when event type 1) @@ -489,8 +512,9 @@ EXPORT_SYMBOL_GPL(smcd_free_dev); * Context: * - Function called in IRQ context from ISM device driver event handler. */ -void smcd_handle_event(struct smcd_dev *smcd, struct ism_event *event) +static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -504,17 +528,18 @@ void smcd_handle_event(struct smcd_dev *smcd, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } -EXPORT_SYMBOL_GPL(smcd_handle_event); /* SMCD Device interrupt handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer, DMB number, and the DMBE bitmask. + * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. * Find the connection and schedule the tasklet for this connection. 
* * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ -void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) +static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, + u16 dmbemask) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -524,10 +549,21 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -EXPORT_SYMBOL_GPL(smcd_handle_irq); -void __init smc_ism_init(void) +int smc_ism_init(void) { smc_ism_v2_capable = false; memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); +#if IS_ENABLED(CONFIG_ISM) + return ism_register_client(&smc_ism_client); +#else + return 0; +#endif +} + +void smc_ism_exit(void) +{ +#if IS_ENABLED(CONFIG_ISM) + ism_unregister_client(&smc_ism_client); +#endif } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index d6b2db604fe8..832b2f42d79f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -42,7 +42,8 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); -void smc_ism_init(void); +int smc_ism_init(void); +void smc_ism_exit(void); int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, -- Gitee From b6c631b494ef7f519c56654139d64572b3b3d6b2 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jan 2023 19:17:50 +0100 Subject: [PATCH 07/21] net/smc: Separate SMC-D and ISM APIs ANBZ: #5534 commit 9de4df7b6be1cfca500f8ba21137d53eec45418a upstream. We separate the code implementing the struct smcd_ops API in the ISM device driver from the functions that may be used by other exploiters of ISM devices. 
Note: We start out small, and don't offer the whole breadth of the ISM device for public use, as many functions are specific to or likely only ever used in the context of SMC-D. This is the third part of a bigger overhaul of the interfaces between SMC and ISM. Signed-off-by: Stefan Raspl Signed-off-by: Jan Karcher Signed-off-by: Wenjia Zhang Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- drivers/s390/net/ism_drv.c | 92 ++++++++++++++++++++++++++------------ include/linux/ism.h | 7 +++ include/net/smc.h | 3 +- net/smc/smc_clc.c | 11 +++-- net/smc/smc_core.c | 6 ++- net/smc/smc_diag.c | 3 +- 6 files changed, 86 insertions(+), 36 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 78ed4bdae11d..718a2e201da9 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -273,10 +273,9 @@ static int ism_read_local_gid(struct ism_dev *ism) return ret; } -static int ism_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, +static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, u32 vid) { - struct ism_dev *ism = smcd->priv; union ism_query_rgid cmd; memset(&cmd, 0, sizeof(cmd)); @@ -290,6 +289,11 @@ static int ism_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, return ism_cmd(ism, &cmd); } +static int smcd_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, u32 vid) +{ + return ism_query_rgid(smcd->priv, rgid, vid_valid, vid); +} + static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); @@ -325,9 +329,9 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) return dmb->cpu_addr ? 
0 : -ENOMEM; } -static int ism_register_dmb(struct smcd_dev *smcd, struct ism_dmb *dmb) +int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, + struct ism_client *client) { - struct ism_dev *ism = smcd->priv; union ism_reg_dmb cmd; int ret; @@ -352,18 +356,19 @@ static int ism_register_dmb(struct smcd_dev *smcd, struct ism_dmb *dmb) goto out; } dmb->dmb_tok = cmd.response.dmb_tok; + ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = client->id; out: return ret; } +EXPORT_SYMBOL_GPL(ism_register_dmb); static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) { - return ism_register_dmb(smcd, (struct ism_dmb *)dmb); + return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, NULL); } -static int ism_unregister_dmb(struct smcd_dev *smcd, struct ism_dmb *dmb) +int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { - struct ism_dev *ism = smcd->priv; union ism_unreg_dmb cmd; int ret; @@ -373,6 +378,8 @@ static int ism_unregister_dmb(struct smcd_dev *smcd, struct ism_dmb *dmb) cmd.request.dmb_tok = dmb->dmb_tok; + ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = NO_CLIENT; + ret = ism_cmd(ism, &cmd); if (ret && ret != ISM_ERROR) goto out; @@ -381,15 +388,15 @@ static int ism_unregister_dmb(struct smcd_dev *smcd, struct ism_dmb *dmb) out: return ret; } +EXPORT_SYMBOL_GPL(ism_unregister_dmb); static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) { - return ism_unregister_dmb(smcd, (struct ism_dmb *)dmb); + return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); } -static int ism_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) +static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) { - struct ism_dev *ism = smcd->priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -401,9 +408,13 @@ static int ism_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int ism_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) +static int smcd_add_vlan_id(struct 
smcd_dev *smcd, u64 vlan_id) +{ + return ism_add_vlan_id(smcd->priv, vlan_id); +} + +static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) { - struct ism_dev *ism = smcd->priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -415,6 +426,11 @@ static int ism_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) return ism_cmd(ism, &cmd); } +static int smcd_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) +{ + return ism_del_vlan_id(smcd->priv, vlan_id); +} + static int ism_set_vlan_required(struct smcd_dev *smcd) { return ism_cmd_simple(smcd->priv, ISM_SET_VLAN); @@ -425,8 +441,8 @@ static int ism_reset_vlan_required(struct smcd_dev *smcd) return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); } -static int ism_signal_ieq(struct smcd_dev *smcd, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info) +static int smcd_signal_ieq(struct smcd_dev *smcd, u64 rgid, u32 trigger_irq, + u32 event_code, u64 info) { struct ism_dev *ism = smcd->priv; union ism_sig_ieq cmd; @@ -449,8 +465,9 @@ static unsigned int max_bytes(unsigned int start, unsigned int len, return min(boundary - (start & (boundary - 1)), len); } -static int ism_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, unsigned int size) +static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size) { struct ism_dev *ism = smcd->priv; unsigned int bytes; @@ -494,14 +511,15 @@ static void ism_create_system_eid(void) memcpy(&SYSTEM_EID.type, tmp, 4); } -static u8 *ism_get_system_eid(void) +u8 *ism_get_seid(void) { return SYSTEM_EID.seid_string; } +EXPORT_SYMBOL_GPL(ism_get_seid); -static u16 ism_get_chid(struct smcd_dev *smcd) +static u16 smcd_get_chid(struct smcd_dev *smcd) { - struct ism_dev *ism = (struct ism_dev *)smcd->priv; + struct ism_dev *ism = smcd->priv; if (!ism || !ism->pdev) return 0; @@ -564,18 +582,26 @@ static irqreturn_t ism_handle_irq(int irq, void *data) return 
IRQ_HANDLED; } +static u64 smcd_get_local_gid(struct smcd_dev *smcd) +{ + struct ism_dev *ism = smcd->priv; + + return ism->local_gid; +} + static const struct smcd_ops ism_ops = { - .query_remote_gid = ism_query_rgid, + .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, - .add_vlan_id = ism_add_vlan_id, - .del_vlan_id = ism_del_vlan_id, + .add_vlan_id = smcd_add_vlan_id, + .del_vlan_id = smcd_del_vlan_id, .set_vlan_required = ism_set_vlan_required, .reset_vlan_required = ism_reset_vlan_required, - .signal_event = ism_signal_ieq, - .move_data = ism_move, - .get_system_eid = ism_get_system_eid, - .get_chid = ism_get_chid, + .signal_event = smcd_signal_ieq, + .move_data = smcd_move, + .get_system_eid = ism_get_seid, + .get_local_gid = smcd_get_local_gid, + .get_chid = smcd_get_chid, }; static void ism_dev_add_work_func(struct work_struct *work) @@ -598,10 +624,15 @@ static int ism_dev_init(struct ism_dev *ism) if (ret <= 0) goto out; + ism->sba_client_arr = kzalloc(ISM_NR_DMBS, GFP_KERNEL); + if (!ism->sba_client_arr) + goto free_vectors; + memset(ism->sba_client_arr, NO_CLIENT, ISM_NR_DMBS); + ret = request_irq(pci_irq_vector(pdev, 0), ism_handle_irq, 0, pci_name(pdev), ism); if (ret) - goto free_vectors; + goto free_client_arr; ret = register_sba(ism); if (ret) @@ -615,7 +646,7 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_ieq; - if (!ism_add_vlan_id(ism->smcd, ISM_RESERVED_VLANID)) + if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) /* hardware is V2 capable */ ism_create_system_eid(); @@ -650,6 +681,8 @@ static int ism_dev_init(struct ism_dev *ism) unregister_sba(ism); free_irq: free_irq(pci_irq_vector(pdev, 0), ism); +free_client_arr: + kfree(ism->sba_client_arr); free_vectors: pci_free_irq_vectors(pdev); out: @@ -745,10 +778,11 @@ static void ism_dev_exit(struct ism_dev *ism) if (SYSTEM_EID.serial_number[0] != '0' || SYSTEM_EID.type[0] != '0') - ism_del_vlan_id(ism->smcd, 
ISM_RESERVED_VLANID); + ism_del_vlan_id(ism, ISM_RESERVED_VLANID); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); + kfree(ism->sba_client_arr); pci_free_irq_vectors(pdev); list_del_init(&ism->list); } diff --git a/include/linux/ism.h b/include/linux/ism.h index 55c8ad306928..bdd29e08d4fe 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -87,4 +87,11 @@ static inline void ism_set_priv(struct ism_dev *dev, struct ism_client *client, dev->priv[client->id] = priv; } +int ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb, + struct ism_client *client); +int ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb); +int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, + unsigned int offset, void *data, unsigned int size); +u8 *ism_get_seid(void); + #endif /* _ISM_H */ diff --git a/include/net/smc.h b/include/net/smc.h index 151aa54d9ad2..d5f8f18169d7 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -66,14 +66,15 @@ struct smcd_ops { bool sf, unsigned int offset, void *data, unsigned int size); u8* (*get_system_eid)(void); + u64 (*get_local_gid)(struct smcd_dev *dev); u16 (*get_chid)(struct smcd_dev *dev); }; struct smcd_dev { const struct smcd_ops *ops; struct device dev; + struct ism_dev *ism; void *priv; - u64 local_gid; struct list_head list; spinlock_t lock; struct smc_connection **conn; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 1472f31480d8..892502fc114c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -813,6 +813,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) struct smc_clc_v2_extension *v2_ext; struct smc_clc_msg_smcd *pclc_smcd; struct smc_clc_msg_trail *trl; + struct smcd_dev *smcd; int len, i, plen, rc; int reason_code = 0; struct kvec vec[8]; @@ -868,7 +869,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (smcd_indicated(ini->smc_type_v1)) { /* add SMC-D specifics */ if 
(ini->ism_dev[0]) { - pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid); + smcd = ini->ism_dev[0]; + pclc_smcd->ism.gid = + htonll(smcd->ops->get_local_gid(smcd)); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); } @@ -914,8 +917,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) plen += sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { + smcd = ini->ism_dev[i]; gidchids[i - 1].gid = - htonll(ini->ism_dev[i]->local_gid); + htonll(smcd->ops->get_local_gid(smcd)); gidchids[i - 1].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); } @@ -1000,7 +1004,8 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); clc->hdr.typev1 = SMC_TYPE_D; - clc->d0.gid = conn->lgr->smcd->local_gid; + clc->d0.gid = + conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd); clc->d0.token = conn->rmb_desc->token; clc->d0.dmbe_size = conn->rmbe_size_short; clc->d0.dmbe_idx = 0; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5313fc9f3e47..d8d019592ee2 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -497,6 +497,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smcd_dev *smcd = lgr->smcd; struct nlattr *attrs; void *nlh; @@ -512,8 +513,9 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid, - SMC_NLA_LGR_D_PAD)) + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, + smcd->ops->get_local_gid(smcd), + SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, SMC_NLA_LGR_D_PAD)) diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 22d38206ed48..ca57c68d3b9f 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ 
-167,12 +167,13 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; + struct smcd_dev *smcd = conn->lgr->smcd; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid; - dinfo.my_gid = conn->lgr->smcd->local_gid; + dinfo.my_gid = smcd->ops->get_local_gid(smcd); dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; -- Gitee From 8ed1c63bd783b044c3db275e5b16e7ceb222e5d4 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jan 2023 19:17:51 +0100 Subject: [PATCH 08/21] s390/ism: Consolidate SMC-D-related code ANBZ: #5534 commit 820f21009f1bc7a69e28752f6c6d9544401ca526 upstream. The ism module had SMC-D-specific code sprinkled across the entire module. We are now consolidating the SMC-D-specific parts into the latter parts of the module, so it becomes more clear what code is intended for use with ISM, and which parts are glue code for usage in the context of SMC-D. This is the fourth part of a bigger overhaul of the interfaces between SMC and ISM. Signed-off-by: Stefan Raspl Signed-off-by: Jan Karcher Signed-off-by: Wenjia Zhang Signed-off-by: David S. 
Miller Signed-off-by: Wen Gu --- drivers/s390/net/ism_drv.c | 162 ++++++++++++++++++++++--------------- include/linux/ism.h | 2 + include/net/smc.h | 5 +- net/smc/smc_ism.c | 63 +++++++++------ 4 files changed, 143 insertions(+), 89 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 718a2e201da9..e7740135a427 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -289,11 +289,6 @@ static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, return ism_cmd(ism, &cmd); } -static int smcd_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, u32 vid) -{ - return ism_query_rgid(smcd->priv, rgid, vid_valid, vid); -} - static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); @@ -362,11 +357,6 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, } EXPORT_SYMBOL_GPL(ism_register_dmb); -static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, NULL); -} - int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { union ism_unreg_dmb cmd; @@ -390,11 +380,6 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) } EXPORT_SYMBOL_GPL(ism_unregister_dmb); -static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); -} - static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) { union ism_set_vlan_id cmd; @@ -408,11 +393,6 @@ static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int smcd_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_add_vlan_id(smcd->priv, vlan_id); -} - static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) { union ism_set_vlan_id cmd; @@ -426,25 +406,9 @@ static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int 
smcd_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_del_vlan_id(smcd->priv, vlan_id); -} - -static int ism_set_vlan_required(struct smcd_dev *smcd) +static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, + u32 event_code, u64 info) { - return ism_cmd_simple(smcd->priv, ISM_SET_VLAN); -} - -static int ism_reset_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); -} - -static int smcd_signal_ieq(struct smcd_dev *smcd, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info) -{ - struct ism_dev *ism = smcd->priv; union ism_sig_ieq cmd; memset(&cmd, 0, sizeof(cmd)); @@ -465,11 +429,9 @@ static unsigned int max_bytes(unsigned int start, unsigned int len, return min(boundary - (start & (boundary - 1)), len); } -static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size) +int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, + unsigned int offset, void *data, unsigned int size) { - struct ism_dev *ism = smcd->priv; unsigned int bytes; u64 dmb_req; int ret; @@ -490,6 +452,7 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, return 0; } +EXPORT_SYMBOL_GPL(ism_move); static struct ism_systemeid SYSTEM_EID = { .seid_string = "IBM-SYSZ-ISMSEID00000000", @@ -517,10 +480,8 @@ u8 *ism_get_seid(void) } EXPORT_SYMBOL_GPL(ism_get_seid); -static u16 smcd_get_chid(struct smcd_dev *smcd) +static u16 ism_get_chid(struct ism_dev *ism) { - struct ism_dev *ism = smcd->priv; - if (!ism || !ism->pdev) return 0; @@ -582,28 +543,11 @@ static irqreturn_t ism_handle_irq(int irq, void *data) return IRQ_HANDLED; } -static u64 smcd_get_local_gid(struct smcd_dev *smcd) +static u64 ism_get_local_gid(struct ism_dev *ism) { - struct ism_dev *ism = smcd->priv; - return ism->local_gid; } -static const struct smcd_ops ism_ops = { - .query_remote_gid = smcd_query_rgid, - .register_dmb = smcd_register_dmb, - 
.unregister_dmb = smcd_unregister_dmb, - .add_vlan_id = smcd_add_vlan_id, - .del_vlan_id = smcd_del_vlan_id, - .set_vlan_required = ism_set_vlan_required, - .reset_vlan_required = ism_reset_vlan_required, - .signal_event = smcd_signal_ieq, - .move_data = smcd_move, - .get_system_eid = ism_get_seid, - .get_local_gid = smcd_get_local_gid, - .get_chid = smcd_get_chid, -}; - static void ism_dev_add_work_func(struct work_struct *work) { struct ism_client *client = container_of(work, struct ism_client, @@ -845,3 +789,95 @@ static void __exit ism_exit(void) module_init(ism_init); module_exit(ism_exit); + +/*************************** SMC-D Implementation *****************************/ + +#if IS_ENABLED(CONFIG_SMC) +static int smcd_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, + u32 vid) +{ + return ism_query_rgid(smcd->priv, rgid, vid_valid, vid); +} + +static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, + struct ism_client *client) +{ + return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, client); +} + +static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); +} + +static int smcd_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) +{ + return ism_add_vlan_id(smcd->priv, vlan_id); +} + +static int smcd_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) +{ + return ism_del_vlan_id(smcd->priv, vlan_id); +} + +static int smcd_set_vlan_required(struct smcd_dev *smcd) +{ + return ism_cmd_simple(smcd->priv, ISM_SET_VLAN); +} + +static int smcd_reset_vlan_required(struct smcd_dev *smcd) +{ + return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); +} + +static int smcd_signal_ieq(struct smcd_dev *smcd, u64 rgid, u32 trigger_irq, + u32 event_code, u64 info) +{ + return ism_signal_ieq(smcd->priv, rgid, trigger_irq, event_code, info); +} + +static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned 
int size) +{ + return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); +} + +static u64 smcd_get_local_gid(struct smcd_dev *smcd) +{ + return ism_get_local_gid(smcd->priv); +} + +static u16 smcd_get_chid(struct smcd_dev *smcd) +{ + return ism_get_chid(smcd->priv); +} + +static inline struct device *smcd_get_dev(struct smcd_dev *dev) +{ + struct ism_dev *ism = dev->priv; + + return &ism->dev; +} + +static const struct smcd_ops ism_ops = { + .query_remote_gid = smcd_query_rgid, + .register_dmb = smcd_register_dmb, + .unregister_dmb = smcd_unregister_dmb, + .add_vlan_id = smcd_add_vlan_id, + .del_vlan_id = smcd_del_vlan_id, + .set_vlan_required = smcd_set_vlan_required, + .reset_vlan_required = smcd_reset_vlan_required, + .signal_event = smcd_signal_ieq, + .move_data = smcd_move, + .get_system_eid = ism_get_seid, + .get_local_gid = smcd_get_local_gid, + .get_chid = smcd_get_chid, +}; + +const struct smcd_ops *ism_get_smcd_ops(void) +{ + return &ism_ops; +} +EXPORT_SYMBOL_GPL(ism_get_smcd_ops); +#endif diff --git a/include/linux/ism.h b/include/linux/ism.h index bdd29e08d4fe..104ce2fd503a 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -94,4 +94,6 @@ int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); u8 *ism_get_seid(void); +const struct smcd_ops *ism_get_smcd_ops(void); + #endif /* _ISM_H */ diff --git a/include/net/smc.h b/include/net/smc.h index d5f8f18169d7..556b96c12279 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -50,11 +50,13 @@ struct smcd_dmb { #define ISM_ERROR 0xFFFF struct smcd_dev; +struct ism_client; struct smcd_ops { int (*query_remote_gid)(struct smcd_dev *dev, u64 rgid, u32 vid_valid, u32 vid); - int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); + int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, + struct ism_client *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); int 
(*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); @@ -73,7 +75,6 @@ struct smcd_ops { struct smcd_dev { const struct smcd_ops *ops; struct device dev; - struct ism_dev *ism; void *priv; struct list_head list; spinlock_t lock; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 611d4b90fadc..b796532533b7 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -26,6 +26,7 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +#if IS_ENABLED(CONFIG_ISM) static void smcd_register_dev(struct ism_dev *ism); static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); @@ -39,6 +40,7 @@ static struct ism_client smc_ism_client = { .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; +#endif /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) @@ -197,6 +199,7 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { +#if IS_ENABLED(CONFIG_ISM) struct smcd_dmb dmb; int rc; @@ -205,7 +208,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb); + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; @@ -214,6 +217,9 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb_desc->len = dmb.dmb_len; } return rc; +#else + return 0; +#endif } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -307,6 +313,7 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return 
skb->len; } +#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; @@ -350,24 +357,6 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) } } -int smc_ism_signal_shutdown(struct smc_link_group *lgr) -{ - int rc; - union smcd_sw_event_info ev_info; - - if (lgr->peer_shutdown) - return 0; - - memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); - ev_info.vlan_id = lgr->vlan_id; - ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_SHUTDOWN, - ev_info.info); - return rc; -} - /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { @@ -441,9 +430,12 @@ EXPORT_SYMBOL_GPL(smcd_free_dev); static void smcd_register_dev(struct ism_dev *ism) { - const struct smcd_ops *ops = NULL; + const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd; + if (!ops) + return; + smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, ISM_NR_DMBS); if (!smcd) @@ -549,16 +541,39 @@ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } +#endif + +int smc_ism_signal_shutdown(struct smc_link_group *lgr) +{ + int rc = 0; +#if IS_ENABLED(CONFIG_ISM) + union smcd_sw_event_info ev_info; + + if (lgr->peer_shutdown) + return 0; + + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); + ev_info.vlan_id = lgr->vlan_id; + ev_info.code = ISM_EVENT_REQUEST; + rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_SHUTDOWN, + ev_info.info); +#endif + return rc; +} int smc_ism_init(void) { + int rc = 0; + +#if IS_ENABLED(CONFIG_ISM) smc_ism_v2_capable = false; memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); -#if IS_ENABLED(CONFIG_ISM) - return ism_register_client(&smc_ism_client); -#else - return 0; + + rc = ism_register_client(&smc_ism_client); #endif + return rc; } 
void smc_ism_exit(void) -- Gitee From f908ae19cfbf4b5056ebdef26839475153335e8f Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jan 2023 19:17:52 +0100 Subject: [PATCH 09/21] net/smc: De-tangle ism and smc device initialization ANBZ: #5534 commit 8c81ba20349daf9f7e58bb05a0c12f4b71813a30 upstream. The struct device for ISM devices was part of struct smcd_dev. Move to struct ism_dev, provide a new API call in struct smcd_ops, and convert existing SMCD code accordingly. Furthermore, remove struct smcd_dev from struct ism_dev. This is the final part of a bigger overhaul of the interfaces between SMC and ISM. Signed-off-by: Stefan Raspl Signed-off-by: Jan Karcher Signed-off-by: Wenjia Zhang Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- drivers/s390/net/ism_drv.c | 25 +++++++++-------- include/linux/ism.h | 1 - include/net/smc.h | 6 +---- net/smc/af_smc.c | 1 + net/smc/smc_core.c | 6 +++-- net/smc/smc_ism.c | 55 +++++++++----------------------------- net/smc/smc_pnet.c | 40 ++++++++++++++------------- 7 files changed, 52 insertions(+), 82 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index e7740135a427..2dfa1831e95b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -645,6 +645,12 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_lock_init(&ism->lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; + ism->dev.parent = &pdev->dev; + device_initialize(&ism->dev); + dev_set_name(&ism->dev, dev_name(&pdev->dev)); + ret = device_add(&ism->dev); + if (ret) + goto err_dev; ret = pci_enable_device_mem(pdev); if (ret) @@ -662,30 +668,23 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dma_set_max_seg_size(&pdev->dev, SZ_1M); pci_set_master(pdev); - ism->smcd = smcd_alloc_dev(&pdev->dev, dev_name(&pdev->dev), &ism_ops, - ISM_NR_DMBS); - if (!ism->smcd) { - ret = -ENOMEM; - goto err_resource; - } - - ism->smcd->priv = ism; ret = 
ism_dev_init(ism); if (ret) - goto err_free; + goto err_resource; return 0; -err_free: - smcd_free_dev(ism->smcd); err_resource: pci_clear_master(pdev); pci_release_mem_regions(pdev); err_disable: pci_disable_device(pdev); err: - kfree(ism); + device_del(&ism->dev); +err_dev: dev_set_drvdata(&pdev->dev, NULL); + kfree(ism); + return ret; } @@ -739,7 +738,6 @@ static void ism_remove(struct pci_dev *pdev) ism_dev_exit(ism); mutex_unlock(&ism_dev_list.mutex); - smcd_free_dev(ism->smcd); pci_clear_master(pdev); pci_release_mem_regions(pdev); pci_disable_device(pdev); @@ -873,6 +871,7 @@ static const struct smcd_ops ism_ops = { .get_system_eid = ism_get_seid, .get_local_gid = smcd_get_local_gid, .get_chid = smcd_get_chid, + .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/ism.h b/include/linux/ism.h index 104ce2fd503a..ea2bcdae7401 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -30,7 +30,6 @@ struct ism_dev { spinlock_t lock; /* protects the ism device */ struct list_head list; struct pci_dev *pdev; - struct smcd_dev *smcd; struct ism_sba *sba; dma_addr_t sba_dma_addr; diff --git a/include/net/smc.h b/include/net/smc.h index 556b96c12279..597cb9381182 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -70,11 +70,11 @@ struct smcd_ops { u8* (*get_system_eid)(void); u64 (*get_local_gid)(struct smcd_dev *dev); u16 (*get_chid)(struct smcd_dev *dev); + struct device* (*get_dev)(struct smcd_dev *dev); }; struct smcd_dev { const struct smcd_ops *ops; - struct device dev; void *priv; struct list_head list; spinlock_t lock; @@ -90,8 +90,4 @@ struct smcd_dev { u8 going_away : 1; }; -struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs); -void smcd_free_dev(struct smcd_dev *smcd); - #endif /* _SMC_H */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c3079514bdb5..9f95bfeccd3f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3515,6 
+3515,7 @@ static void __exit smc_exit(void) sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); + smc_ism_exit(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d8d019592ee2..32640f3b21bf 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -835,6 +835,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct list_head *lgr_list; + struct smcd_dev *smcd; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; @@ -881,7 +882,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ - get_device(&ini->ism_dev[ini->ism_selected]->dev); + smcd = ini->ism_dev[ini->ism_selected]; + get_device(smcd->ops->get_dev(smcd)); lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected]; lgr->smcd = ini->ism_dev[ini->ism_selected]; lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; @@ -1402,7 +1404,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); + put_device(lgr->smcd->ops->get_dev(lgr->smcd)); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index b796532533b7..17336529d840 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -230,9 +230,11 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; struct nlattr *attrs; + struct ism_dev *ism; int use_cnt = 0; void *nlh; + ism = smcd->priv; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -247,7 +249,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt 
> 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev); + smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -376,41 +378,24 @@ static void smc_ism_event_work(struct work_struct *work) kfree(wrk); } -static void smcd_release(struct device *dev) -{ - struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev); - - kfree(smcd->conn); - kfree(smcd); -} - -struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs) { struct smcd_dev *smcd; - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) { - kfree(smcd); + smcd->conn = devm_kcalloc(parent, max_dmbs, + sizeof(struct smc_connection *), GFP_KERNEL); + if (!smcd->conn) return NULL; - } smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); - if (!smcd->event_wq) { - kfree(smcd->conn); - kfree(smcd); + if (!smcd->event_wq) return NULL; - } - smcd->dev.parent = parent; - smcd->dev.release = smcd_release; - device_initialize(&smcd->dev); - dev_set_name(&smcd->dev, name); smcd->ops = ops; spin_lock_init(&smcd->lock); @@ -420,13 +405,6 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, init_waitqueue_head(&smcd->lgrs_deleted); return smcd; } -EXPORT_SYMBOL_GPL(smcd_alloc_dev); - -void smcd_free_dev(struct smcd_dev *smcd) -{ - put_device(&smcd->dev); -} -EXPORT_SYMBOL_GPL(smcd_free_dev); static void smcd_register_dev(struct ism_dev *ism) { @@ -464,16 +442,9 @@ static void 
smcd_register_dev(struct ism_dev *ism) mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&smcd->dev), smcd->pnetid, + dev_name(&ism->dev), smcd->pnetid, smcd->pnetid_by_user ? " (user defined)" : ""); - if (device_add(&smcd->dev)) { - mutex_lock(&smcd_dev_list.mutex); - list_del(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - smcd_free_dev(smcd); - } - return; } @@ -482,15 +453,13 @@ static void smcd_unregister_dev(struct ism_dev *ism) struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&smcd->dev)); + dev_name(&ism->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); - - device_del(&smcd->dev); } /* SMCD Device event handler. Called from ISM device interrupt handler. diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 1ed4bbccaf31..76372da9d742 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -102,7 +102,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; + struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; @@ -160,16 +160,17 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user && + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->pnetid_by_user && (!pnet_name || - smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " - "%.16s\n", 
dev_name(&smcd_dev->dev), - smcd_dev->pnetid); - memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = false; + "%.16s\n", + dev_name(smcd->ops->get_dev(smcd)), + smcd->pnetid); + memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); + smcd->pnetid_by_user = false; rc = 0; } } @@ -329,8 +330,8 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(&smcd_dev->dev), smcd_name, - IB_DEVICE_NAME_MAX - 1)) + if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), + smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } smcd_dev = NULL; @@ -407,7 +408,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; - struct smcd_dev *smcd_dev; + struct smcd_dev *smcd; + struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -421,14 +423,16 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, ib_port, ib_dev->pnetid[ib_port - 1]); } - smcd_dev = smc_pnet_find_smcd(ib_name); - if (smcd_dev) { - smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); - if (smcddev_applied) + smcd = smc_pnet_find_smcd(ib_name); + if (smcd) { + smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); + if (smcddev_applied) { + dev = smcd->ops->get_dev(smcd); pr_warn_ratelimited("smc: smcd device %s " "applied user defined pnetid " - "%.16s\n", dev_name(&smcd_dev->dev), - smcd_dev->pnetid); + "%.16s\n", dev_name(dev), + smcd->pnetid); + } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. 
@@ -1176,7 +1180,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(&smcddev->dev); + const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; -- Gitee From eecc76c30231244dda9ce050f5e6980bcf7f3037 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 25 Jan 2023 23:14:21 -0800 Subject: [PATCH 10/21] net: add missing includes of linux/splice.h ANBZ: #5534 commit 462502ff9acb7bb02405e3e486428472db7c48dc upstream. Number of files depend on linux/splice.h getting included by linux/skbuff.h which soon will no longer be the case. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- net/smc/af_smc.c | 1 + net/smc/smc_rx.c | 1 + net/unix/af_unix.c | 1 + 3 files changed, 3 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9f95bfeccd3f..7417c5fa2979 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 17c5aee7ee4f..5a50c159e747 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -13,6 +13,7 @@ #include #include #include +#include #include diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 28721e9575b7..429345f34905 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -111,6 +111,7 @@ #include #include #include +#include #include #include -- Gitee From 861c45ee33d7f860f58c35950f3a290a071d683a Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Tue, 7 Mar 2023 11:23:46 +0800 Subject: [PATCH 11/21] net/smc: fix fallback failed while sendmsg with fastopen ANBZ: #5534 commit ce7ca794712f186da99719e8b4e97bd5ddbb04c3 upstream. Before determining whether the msg has unsupported options, it has been prematurely terminated by the wrong status check. 
For the application, the general usage of MSG_FASTOPEN looks like fd = socket(...) /* rather than connect */ sendto(fd, data, len, MSG_FASTOPEN) Hence, we need to check the flag before the state check, because the sock state here is always SMC_INIT when an application tries MSG_FASTOPEN. Once we find unsupported options, fall back to TCP. Fixes: ee9dfbef02d1 ("net/smc: handle sockopts forcing fallback") Signed-off-by: D. Wythe Signed-off-by: Simon Horman v2 -> v1: Optimize code style Reviewed-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- net/smc/af_smc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 7417c5fa2979..1c5787e0955a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2659,16 +2659,14 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = -EPIPE; + int rc; smc = smc_sk(sk); lock_sock(sk); - if ((sk->sk_state != SMC_ACTIVE) && - (sk->sk_state != SMC_APPCLOSEWAIT1) && - (sk->sk_state != SMC_INIT)) - goto out; + /* SMC does not support connect with fastopen */ if (msg->msg_flags & MSG_FASTOPEN) { + /* not connected yet, fallback */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); if (rc) @@ -2677,6 +2675,11 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) rc = -EINVAL; goto out; } + } else if ((sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_INIT)) { + rc = -EPIPE; + goto out; } if (smc->use_fallback) { -- Gitee From 52b7361b5312bd06ec3ce2a0526b2c01fa40d496 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 8 Mar 2023 16:17:12 +0800 Subject: [PATCH 12/21] net/smc: fix NULL sndbuf_desc in smc_cdc_tx_handler() ANBZ: #5534 commit 22a825c541d775c1dbe7b2402786025acad6727b upstream.
When performing a stress test on SMC-R by rmmod mlx5_ib driver during the wrk/nginx test, we found that there is a probability of triggering a panic while terminating all link groups. This issue is due to the race between smc_smcr_terminate_all() and smc_buf_create(). smc_smcr_terminate_all smc_buf_create /* init */ conn->sndbuf_desc = NULL; ... __smc_lgr_terminate smc_conn_kill smc_close_abort smc_cdc_get_slot_and_msg_send __softirqentry_text_start smc_wr_tx_process_cqe smc_cdc_tx_handler READ(conn->sndbuf_desc->len); /* panic due to NULL sndbuf_desc */ conn->sndbuf_desc = xxx; This patch tries to fix the issue by always checking the sndbuf_desc before sending any CDC msg, to make sure that no null pointer is seen during cqe processing. Fixes: 0b29ec643613 ("net/smc: immediate termination for SMCR link groups") Signed-off-by: D. Wythe Reviewed-by: Tony Lu Reviewed-by: Wenjia Zhang Link: https://lore.kernel.org/r/1678263432-17329-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski Signed-off-by: Wen Gu --- net/smc/smc_cdc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 53f63bfbaf5f..89105e95b452 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -114,6 +114,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, union smc_host_cursor cfed; int rc; + if (unlikely(!READ_ONCE(conn->sndbuf_desc))) + return -ENOBUFS; + smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; -- Gitee From 3fa48a76f2a936e1e3d372a0f4dbe645237c0f16 Mon Sep 17 00:00:00 2001 From: Wenjia Zhang Date: Mon, 13 Mar 2023 11:08:28 +0100 Subject: [PATCH 13/21] net/smc: fix deadlock triggered by cancel_delayed_work_syn() ANBZ: #5534 commit 13085e1b5cab8ad802904d72e6a6dae85ae0cd20 upstream.
The following LOCKDEP was detected: Workqueue: events smc_lgr_free_work [smc] WARNING: possible circular locking dependency detected 6.1.0-20221027.rc2.git8.56bc5b569087.300.fc36.s390x+debug #1 Not tainted ------------------------------------------------------ kworker/3:0/176251 is trying to acquire lock: 00000000f1467148 ((wq_completion)smc_tx_wq-00000000#2){+.+.}-{0:0}, at: __flush_workqueue+0x7a/0x4f0 but task is already holding lock: 0000037fffe97dc8 ((work_completion)(&(&lgr->free_work)->work)){+.+.}-{0:0}, at: process_one_work+0x232/0x730 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #4 ((work_completion)(&(&lgr->free_work)->work)){+.+.}-{0:0}: __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 __flush_work+0x76/0xf0 __cancel_work_timer+0x170/0x220 __smc_lgr_terminate.part.0+0x34/0x1c0 [smc] smc_connect_rdma+0x15e/0x418 [smc] __smc_connect+0x234/0x480 [smc] smc_connect+0x1d6/0x230 [smc] __sys_connect+0x90/0xc0 __do_sys_socketcall+0x186/0x370 __do_syscall+0x1da/0x208 system_call+0x82/0xb0 -> #3 (smc_client_lgr_pending){+.+.}-{3:3}: __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 __mutex_lock+0x96/0x8e8 mutex_lock_nested+0x32/0x40 smc_connect_rdma+0xa4/0x418 [smc] __smc_connect+0x234/0x480 [smc] smc_connect+0x1d6/0x230 [smc] __sys_connect+0x90/0xc0 __do_sys_socketcall+0x186/0x370 __do_syscall+0x1da/0x208 system_call+0x82/0xb0 -> #2 (sk_lock-AF_SMC){+.+.}-{0:0}: __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 lock_sock_nested+0x46/0xa8 smc_tx_work+0x34/0x50 [smc] process_one_work+0x30c/0x730 worker_thread+0x62/0x420 kthread+0x138/0x150 __ret_from_fork+0x3c/0x58 ret_from_fork+0xa/0x40 -> #1 ((work_completion)(&(&smc->conn.tx_work)->work)){+.+.}-{0:0}: __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 process_one_work+0x2bc/0x730 worker_thread+0x62/0x420 kthread+0x138/0x150 
__ret_from_fork+0x3c/0x58 ret_from_fork+0xa/0x40 -> #0 ((wq_completion)smc_tx_wq-00000000#2){+.+.}-{0:0}: check_prev_add+0xd8/0xe88 validate_chain+0x70c/0xb20 __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 __flush_workqueue+0xaa/0x4f0 drain_workqueue+0xaa/0x158 destroy_workqueue+0x44/0x2d8 smc_lgr_free+0x9e/0xf8 [smc] process_one_work+0x30c/0x730 worker_thread+0x62/0x420 kthread+0x138/0x150 __ret_from_fork+0x3c/0x58 ret_from_fork+0xa/0x40 other info that might help us debug this: Chain exists of: (wq_completion)smc_tx_wq-00000000#2 --> smc_client_lgr_pending --> (work_completion)(&(&lgr->free_work)->work) Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock((work_completion)(&(&lgr->free_work)->work)); lock(smc_client_lgr_pending); lock((work_completion) (&(&lgr->free_work)->work)); lock((wq_completion)smc_tx_wq-00000000#2); *** DEADLOCK *** 2 locks held by kworker/3:0/176251: #0: 0000000080183548 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x232/0x730 #1: 0000037fffe97dc8 ((work_completion) (&(&lgr->free_work)->work)){+.+.}-{0:0}, at: process_one_work+0x232/0x730 stack backtrace: CPU: 3 PID: 176251 Comm: kworker/3:0 Not tainted Hardware name: IBM 8561 T01 701 (z/VM 7.2.0) Call Trace: [<000000002983c3e4>] dump_stack_lvl+0xac/0x100 [<0000000028b477ae>] check_noncircular+0x13e/0x160 [<0000000028b48808>] check_prev_add+0xd8/0xe88 [<0000000028b49cc4>] validate_chain+0x70c/0xb20 [<0000000028b4bd26>] __lock_acquire+0x58e/0xbd8 [<0000000028b4cf6a>] lock_acquire.part.0+0xe2/0x248 [<0000000028b4d17c>] lock_acquire+0xac/0x1c8 [<0000000028addaaa>] __flush_workqueue+0xaa/0x4f0 [<0000000028addf9a>] drain_workqueue+0xaa/0x158 [<0000000028ae303c>] destroy_workqueue+0x44/0x2d8 [<000003ff8029af26>] smc_lgr_free+0x9e/0xf8 [smc] [<0000000028adf3d4>] process_one_work+0x30c/0x730 [<0000000028adf85a>] worker_thread+0x62/0x420 [<0000000028aeac50>] kthread+0x138/0x150 [<0000000028a63914>] __ret_from_fork+0x3c/0x58 [<00000000298503da>] 
ret_from_fork+0xa/0x40 INFO: lockdep is turned off. =================================================================== This deadlock occurs because cancel_delayed_work_sync() waits for the work(&lgr->free_work) to finish, while the &lgr->free_work waits for the work(lgr->tx_wq), which needs the sk_lock-AF_SMC, that is already used under the mutex_lock. The solution is to use cancel_delayed_work() instead, which kills off a pending work. Fixes: a52bcc919b14 ("net/smc: improve termination processing") Signed-off-by: Wenjia Zhang Reviewed-by: Jan Karcher Reviewed-by: Karsten Graul Reviewed-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 32640f3b21bf..c2aba418785d 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1479,7 +1479,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (lgr->terminating) return; /* lgr already terminating */ /* cancel free_work sync, will terminate when lgr->freeing is set */ - cancel_delayed_work_sync(&lgr->free_work); + cancel_delayed_work(&lgr->free_work); lgr->terminating = 1; /* kill remaining link group connections */ -- Gitee From 3c4827e492abc72ff1d083d431ce8c3c08494cb6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 13 Mar 2023 11:08:29 +0100 Subject: [PATCH 14/21] net/smc: Fix device de-init sequence ANBZ: #5534 commit 9d876d3ef27fa84355597ad269939772192356d8 upstream. CLC message initialization was not properly reversed in error handling path. Reported-and-suggested-by: Alexander Gordeev Signed-off-by: Stefan Raspl Signed-off-by: Wenjia Zhang Reviewed-by: Tony Lu Signed-off-by: David S. 
Miller Signed-off-by: Wen Gu --- net/smc/af_smc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1c5787e0955a..0b89a37a1a0d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3503,6 +3503,7 @@ static int __init smc_init(void) out_nl: smc_nl_exit(); out_ism: + smc_clc_exit(); smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); -- Gitee From 52aadc5a08c79b25e6ff2bcd993ec8e001890df4 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 13 Mar 2023 11:10:31 +0100 Subject: [PATCH 15/21] net/smc: Introduce explicit check for v2 support ANBZ: #5534 commit f947568e258038d3c2f8f38a9a7dabaca36643ec upstream. Previously, v2 support was derived from a very specific format of the SEID as part of the SMC-D codebase. Make this part of the SMC-D device API, so implementers do not need to adhere to a specific SEID format. Signed-off-by: Stefan Raspl Reviewed-and-tested-by: Jan Karcher Reviewed-by: Wenjia Zhang Signed-off-by: Wenjia Zhang Reviewed-by: Tony Lu Signed-off-by: David S. 
Miller Signed-off-by: Wen Gu --- drivers/s390/net/ism_drv.c | 7 +++++++ include/net/smc.h | 1 + net/smc/smc_ism.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2dfa1831e95b..57db80f59a51 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -841,6 +841,12 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); } +static int smcd_supports_v2(void) +{ + return SYSTEM_EID.serial_number[0] != '0' || + SYSTEM_EID.type[0] != '0'; +} + static u64 smcd_get_local_gid(struct smcd_dev *smcd) { return ism_get_local_gid(smcd->priv); @@ -868,6 +874,7 @@ static const struct smcd_ops ism_ops = { .reset_vlan_required = smcd_reset_vlan_required, .signal_event = smcd_signal_ieq, .move_data = smcd_move, + .supports_v2 = smcd_supports_v2, .get_system_eid = ism_get_seid, .get_local_gid = smcd_get_local_gid, .get_chid = smcd_get_chid, diff --git a/include/net/smc.h b/include/net/smc.h index 597cb9381182..a002552be29c 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -67,6 +67,7 @@ struct smcd_ops { int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); + int (*supports_v2)(void); u8* (*get_system_eid)(void); u64 (*get_local_gid)(struct smcd_dev *dev); u16 (*get_chid)(struct smcd_dev *dev); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 17336529d840..a1e191a323d7 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -428,7 +428,7 @@ static void smcd_register_dev(struct ism_dev *ism) u8 *system_eid = NULL; system_eid = smcd->ops->get_system_eid(); - if (system_eid[24] != '0' || system_eid[28] != '0') { + if (smcd->ops->supports_v2()) { smc_ism_v2_capable = true; memcpy(smc_ism_v2_system_eid, system_eid, SMC_MAX_EID_LEN); -- Gitee From 9ad1e3fb355bdabb11da596c706f595ff2515598 Mon Sep 17 00:00:00 2001 
From: Kai Shen Date: Fri, 17 Mar 2023 03:21:32 +0000 Subject: [PATCH 16/21] net/smc: Use percpu ref for wr tx reference ANBZ: #5534 commit 79a22238b4f22c45cadd3b4040d644f4de320d1b upstream. The refcount wr_tx_refcnt may cause cache thrashing problems among cores and we can use percpu ref to mitigate this issue here. We gain some performance improvement with percpu ref here on our customized smc-r version. Applying cache alignment may also mitigate this problem but it seems more reasonable to use percpu ref here. We can also replace wr_reg_refcnt with one percpu reference like wr_tx_refcnt. redis-benchmark on smc-r with atomic wr_tx_refcnt: SET: 525707.06 requests per second, p50=0.087 msec GET: 554877.38 requests per second, p50=0.087 msec redis-benchmark on the percpu_ref version: SET: 540482.06 requests per second, p50=0.087 msec GET: 570711.12 requests per second, p50=0.079 msec Cases are like "redis-benchmark -h x.x.x.x -q -t set,get -P 1 -n 5000000 -c 50 -d 10 --threads 4". Signed-off-by: Kai Shen Reviewed-by: Tony Lu Signed-off-by: David S.
Miller [fix the conflict to un-backport upstream patch] Signed-off-by: Wen Gu --- net/smc/smc_core.h | 10 ++++++++-- net/smc/smc_wr.c | 35 ++++++++++++++++++++++++++++------- net/smc/smc_wr.h | 5 ++--- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 975cdcc910b7..98bc785806cf 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -107,7 +107,10 @@ struct smc_link { unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ - atomic_t wr_tx_refcnt; /* tx refs to link */ + struct { + struct percpu_ref wr_tx_refs; + } ____cacheline_aligned_in_smp; + struct completion tx_ref_comp; struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ @@ -121,7 +124,10 @@ struct smc_link { struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ - atomic_t wr_reg_refcnt; /* reg refs to link */ + struct { + struct percpu_ref wr_reg_refs; + } ____cacheline_aligned_in_smp; + struct completion reg_ref_comp; enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 26f8f240d9e8..f3daef88fca8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -377,12 +377,11 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (rc) return rc; - atomic_inc(&link->wr_reg_refcnt); + percpu_ref_get(&link->wr_reg_refs); rc = wait_event_interruptible_timeout(link->wr_reg_wait, (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); - if (atomic_dec_and_test(&link->wr_reg_refcnt)) - wake_up_all(&link->wr_reg_wait); + percpu_ref_put(&link->wr_reg_refs); if (!rc) { /* timeout - terminate link */ smcr_link_down_cond_sched(link); @@ -643,8 +642,10 @@ void smc_wr_free_link(struct smc_link *lnk) 
smc_wr_wakeup_tx_wait(lnk); smc_wr_tx_wait_no_pending_sends(lnk); - wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); - wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); + percpu_ref_kill(&lnk->wr_reg_refs); + wait_for_completion(&lnk->reg_ref_comp); + percpu_ref_kill(&lnk->wr_tx_refs); + wait_for_completion(&lnk->tx_ref_comp); if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, @@ -843,6 +844,20 @@ void smc_wr_add_dev(struct smc_ib_device *smcibdev) tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } +static void smcr_wr_tx_refs_free(struct percpu_ref *ref) +{ + struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs); + + complete(&lnk->tx_ref_comp); +} + +static void smcr_wr_reg_refs_free(struct percpu_ref *ref) +{ + struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs); + + complete(&lnk->reg_ref_comp); +} + int smc_wr_create_link(struct smc_link *lnk) { struct ib_device *ibdev = lnk->smcibdev->ibdev; @@ -886,9 +901,15 @@ int smc_wr_create_link(struct smc_link *lnk) smc_wr_init_sge(lnk); bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); init_waitqueue_head(&lnk->wr_tx_wait); - atomic_set(&lnk->wr_tx_refcnt, 0); + rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL); + if (rc) + goto dma_unmap; + init_completion(&lnk->tx_ref_comp); init_waitqueue_head(&lnk->wr_reg_wait); - atomic_set(&lnk->wr_reg_refcnt, 0); + rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL); + if (rc) + goto dma_unmap; + init_completion(&lnk->reg_ref_comp); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index a54e90a1110f..f0a4343e39d0 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -63,14 +63,13 @@ static inline bool smc_wr_tx_link_hold(struct smc_link *link) { if (!smc_link_sendable(link)) return false; - atomic_inc(&link->wr_tx_refcnt); + percpu_ref_get(&link->wr_tx_refs); return true; } static inline void 
smc_wr_tx_link_put(struct smc_link *link) { - if (atomic_dec_and_test(&link->wr_tx_refcnt)) - wake_up_all(&link->wr_tx_wait); + percpu_ref_put(&link->wr_tx_refs); } static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) -- Gitee From 44e10e26eb942dbb09fea97ceb5b409ee2bac89b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 18:29:48 +0000 Subject: [PATCH 17/21] net: deal with most data-races in sk_wait_event() ANBZ: #5534 commit d0ac89f6f9879fae316c155de77b5173b3e2c9c9 upstream. __condition is evaluated twice in sk_wait_event() macro. First invocation is lockless, and reads can race with writes, as spotted by syzbot. BUG: KCSAN: data-race in sk_stream_wait_connect / tcp_disconnect write to 0xffff88812d83d6a0 of 4 bytes by task 9065 on cpu 1: tcp_disconnect+0x2cd/0xdb0 inet_shutdown+0x19e/0x1f0 net/ipv4/af_inet.c:911 __sys_shutdown_sock net/socket.c:2343 [inline] __sys_shutdown net/socket.c:2355 [inline] __do_sys_shutdown net/socket.c:2363 [inline] __se_sys_shutdown+0xf8/0x140 net/socket.c:2361 __x64_sys_shutdown+0x31/0x40 net/socket.c:2361 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff88812d83d6a0 of 4 bytes by task 9040 on cpu 0: sk_stream_wait_connect+0x1de/0x3a0 net/core/stream.c:75 tcp_sendmsg_locked+0x2e4/0x2120 net/ipv4/tcp.c:1266 tcp_sendmsg+0x30/0x50 net/ipv4/tcp.c:1484 inet6_sendmsg+0x63/0x80 net/ipv6/af_inet6.c:651 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] __sys_sendto+0x246/0x300 net/socket.c:2142 __do_sys_sendto net/socket.c:2154 [inline] __se_sys_sendto net/socket.c:2150 [inline] __x64_sys_sendto+0x78/0x90 net/socket.c:2150 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00000000 -> 0x00000068 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: 
Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- net/core/stream.c | 12 ++++++------ net/ipv4/tcp_bpf.c | 2 +- net/llc/af_llc.c | 8 +++++--- net/smc/smc_close.c | 4 ++-- net/smc/smc_rx.c | 4 ++-- net/smc/smc_tx.c | 4 ++-- net/tipc/socket.c | 4 ++-- net/tls/tls_main.c | 3 ++- 8 files changed, 22 insertions(+), 19 deletions(-) diff --git a/net/core/stream.c b/net/core/stream.c index a166a32b411f..0d4457f54f62 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -73,8 +73,8 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, - !sk->sk_err && - !((1 << sk->sk_state) & + !READ_ONCE(sk->sk_err) && + !((1 << READ_ONCE(sk->sk_state)) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; @@ -87,9 +87,9 @@ EXPORT_SYMBOL(sk_stream_wait_connect); * sk_stream_closing - Return 1 if we still have things to send in our buffers. 
* @sk: socket to verify */ -static inline int sk_stream_closing(struct sock *sk) +static int sk_stream_closing(const struct sock *sk) { - return (1 << sk->sk_state) & + return (1 << READ_ONCE(sk->sk_state)) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); } @@ -142,8 +142,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, &current_timeo, sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || + sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || (sk_stream_memory_free(sk) && !vm_wait), &wait); sk->sk_write_pending--; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index eaf2308c355a..1e15cc76cd78 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -258,7 +258,7 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); + !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 99a37c411323..01e26698285a 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -582,7 +582,8 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) add_wait_queue(sk_sleep(sk), &wait); while (1) { - if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE, &wait)) + if (sk_wait_event(sk, &timeout, + READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) @@ -602,7 +603,8 @@ static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) add_wait_queue(sk_sleep(sk), &wait); while (1) { - if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT, &wait)) + if (sk_wait_event(sk, &timeout, + READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait)) break; if
(signal_pending(current) || !timeout) break; @@ -621,7 +623,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) while (1) { rc = 0; if (sk_wait_event(sk, &timeout, - (sk->sk_shutdown & RCV_SHUTDOWN) || + (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && !llc->remote_busy_flag && !llc->p_flag), &wait)) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 31db7438857c..dbdf03e8aa5b 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -67,8 +67,8 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - sk->sk_err == ECONNABORTED || - sk->sk_err == ECONNRESET || + READ_ONCE(sk->sk_err) == ECONNABORTED || + READ_ONCE(sk->sk_err) == ECONNRESET || smc->conn.killed, &wait); if (rc) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 5a50c159e747..25d077018813 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -264,9 +264,9 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, - sk->sk_err || + READ_ONCE(sk->sk_err) || cflags->peer_conn_abort || - sk->sk_shutdown & RCV_SHUTDOWN || + READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || conn->killed || fcrit(conn), &wait); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 64dedffe9d26..c45579f56daf 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -113,8 +113,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, - sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || + READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend), diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 38256aabf4f1..9334817a699f 100644 
--- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -300,9 +300,9 @@ static void tsk_rej_rx_queue(struct sock *sk, int error) tipc_sk_respond(sk, skb, error); } -static bool tipc_sk_connected(struct sock *sk) +static bool tipc_sk_connected(const struct sock *sk) { - return sk->sk_state == TIPC_ESTABLISHED; + return READ_ONCE(sk->sk_state) == TIPC_ESTABLISHED; } /* tipc_sk_type_connectionless - check if the socket is datagram socket diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index bcd6f01594ee..b52a3e64b3c9 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -92,7 +92,8 @@ int wait_on_pending_writer(struct sock *sk, long *timeo) break; } - if (sk_wait_event(sk, timeo, !sk->sk_write_pending, &wait)) + if (sk_wait_event(sk, timeo, + !READ_ONCE(sk->sk_write_pending), &wait)) break; } remove_wait_queue(sk_sleep(sk), &wait); -- Gitee From f93a72427c377bae571dbe727df4310404a3ce64 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 18 May 2023 13:14:55 +0800 Subject: [PATCH 18/21] net/smc: Reset connection when trying to use SMCRv2 fails. ANBZ: #5534 commit 35112271672ae98f45df7875244a4e33aa215e31 upstream. We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. 
It can be reproduced by: - smc_run nginx - smc_run wrk -t 32 -c 500 -d 30 http://: BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc] RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06 RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000 RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00 RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000 R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180 R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib] ? smcr_buf_map_link+0x24b/0x290 [smc] ? __smc_buf_create+0x4ee/0x9b0 [smc] smc_clc_send_accept+0x4c/0xb0 [smc] smc_listen_work+0x346/0x650 [smc] ? __schedule+0x279/0x820 process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 During the CLC handshake, server sequentially tries available SMCRv2 and SMCRv1 devices in smc_listen_work(). If an SMCRv2 device is found. SMCv2 based link group and link will be assigned to the connection. Then assumed that some buffer assignment errors happen later in the CLC handshake, such as RMB registration failure, server will give up SMCRv2 and try SMCRv1 device instead. 
But the resources assigned to the connection won't be reset. When server tries SMCRv1 device, the connection creation process will be executed again. Since conn->lnk has been assigned when trying SMCRv2, it will not be set to the correct SMCRv1 link in smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to correct SMCRv1 link group but conn->lnk points to the SMCRv2 link mistakenly. Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx] will be accessed. Since the link->link_idx is not correct, the related MR may not have been initialized, so crash happens. | Try SMCRv2 device first | |-> conn->lgr: assign existed SMCRv2 link group; | |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC); | |-> sndbuf & RMB creation fails, quit; | | Try SMCRv1 device then | |-> conn->lgr: create SMCRv1 link group and assign; | |-> conn->link: keep SMCRv2 link mistakenly; | |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0] | initialized. | | Then smc_clc_send_confirm_accept() accesses | conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash. v This patch tries to fix this by cleaning conn->lnk before assigning link. In addition, it is better to reset the connection and clean the resources assigned if trying SMCRv2 failed in buffer creation or registration. Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2") Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/ Signed-off-by: Wen Gu Reviewed-by: Tony Lu Signed-off-by: David S. 
Miller Signed-off-by: Wen Gu --- net/smc/af_smc.c | 9 +++++++-- net/smc/smc_core.c | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0b89a37a1a0d..714eb2e27f73 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2002,8 +2002,10 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, return rc; /* create send buffer and rmb */ - if (smc_buf_create(new_smc, false)) + if (smc_buf_create(new_smc, false)) { + smc_conn_abort(new_smc, ini->first_contact_local); return SMC_CLC_DECL_MEM; + } return 0; } @@ -2219,8 +2221,11 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, smcr_version = ini->smcr_version; ini->smcr_version = SMC_V2; rc = smc_listen_rdma_init(new_smc, ini); - if (!rc) + if (!rc) { rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (rc) + smc_conn_abort(new_smc, ini->first_contact_local); + } if (!rc) return; ini->smcr_version = smcr_version; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index c2aba418785d..4925ddd4f102 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -127,6 +127,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) int i, j; /* do link balancing */ + conn->lnk = NULL; /* reset conn->lnk first */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &conn->lgr->lnk[i]; -- Gitee From 025931a7af33c28769d4693793b9fa79a8d6e74f Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 26 May 2023 19:49:00 +0800 Subject: [PATCH 19/21] net/smc: Scan from current RMB list when no position specified ANBZ: #5534 commit c23fd78e3bd58009c291e2d474e8089ea464a07b upstream. When finding the first RMB of link group, it should start from the current RMB list whose index is 0. So fix it. 
Fixes: b4ba4652b3f8 ("net/smc: extend LLC layer for SMC-Rv2") Signed-off-by: Wen Gu Signed-off-by: Paolo Abeni Signed-off-by: Wen Gu --- net/smc/smc_llc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a09fe34d94eb..c0d3aa5fed99 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -576,7 +576,10 @@ static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, { struct smc_buf_desc *buf_next; - if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + if (!buf_pos) + return _smc_llc_get_next_rmb(lgr, buf_lst); + + if (list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { (*buf_lst)++; return _smc_llc_get_next_rmb(lgr, buf_lst); } -- Gitee From b7cd90a233d38e30afe5d205a930beefe7701f31 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 26 May 2023 19:49:01 +0800 Subject: [PATCH 20/21] net/smc: Don't use RMBs not mapped to new link in SMCRv2 ADD LINK ANBZ: #5534 commit c4ee5865dea20c16ac199fd54d0523bf71dc53a5 upstream. We encountered a crash when using SMCRv2. It is caused by a logical error in smc_llc_fill_ext_v2(). 
BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 7 PID: 453 Comm: kworker/7:4 Kdump: loaded Tainted: G W E 6.4.0-rc3+ #44 Workqueue: events smc_llc_add_link_work [smc] RIP: 0010:smc_llc_fill_ext_v2+0x117/0x280 [smc] RSP: 0018:ffffacb5c064bd88 EFLAGS: 00010282 RAX: ffff9a6bc1c3c02c RBX: ffff9a6be3558000 RCX: 0000000000000000 RDX: 0000000000000002 RSI: 0000000000000002 RDI: 000000000000000a RBP: ffffacb5c064bdb8 R08: 0000000000000040 R09: 000000000000000c R10: ffff9a6bc0910300 R11: 0000000000000002 R12: 0000000000000000 R13: 0000000000000002 R14: ffff9a6bc1c3c02c R15: ffff9a6be3558250 FS: 0000000000000000(0000) GS:ffff9a6eefdc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 000000010b078003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: smc_llc_send_add_link+0x1ae/0x2f0 [smc] smc_llc_srv_add_link+0x2c9/0x5a0 [smc] ? cc_mkenc+0x40/0x60 smc_llc_add_link_work+0xb8/0x140 [smc] process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 When an alternate RNIC is available in system, SMC will try to add a new link based on the RNIC for resilience. All the RMBs in use will be mapped to the new link. Then the RMBs' MRs corresponding to the new link will be filled into SMCRv2 LLC ADD LINK messages. However, smc_llc_fill_ext_v2() mistakenly accesses unused RMBs which haven't been mapped to the new link and have no valid MRs, thus causing a crash. So this patch fixes the logic.
Fixes: b4ba4652b3f8 ("net/smc: extend LLC layer for SMC-Rv2") Signed-off-by: Wen Gu Signed-off-by: Paolo Abeni Signed-off-by: Wen Gu --- net/smc/smc_llc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index c0d3aa5fed99..57c7ec70b823 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -615,6 +615,8 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, goto out; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); for (i = 0; i < ext->num_rkeys; i++) { + while (buf_pos && !(buf_pos)->used) + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); if (!buf_pos) break; rmb = buf_pos; @@ -624,8 +626,6 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); - while (buf_pos && !(buf_pos)->used) - buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); } len += i * sizeof(ext->rt[0]); out: -- Gitee From 79fbb7abffb361a6d67834ecad55cb0411fe69f5 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 1 Jun 2023 16:41:52 +0800 Subject: [PATCH 21/21] net/smc: Avoid to access invalid RMBs' MRs in SMCRv1 ADD LINK CONT ANBZ: #5534 commit c308e9ec004721a656c193243eab61a8be324657 upstream. SMCRv1 has a similar issue to SMCRv2 (see link below) that may access invalid MRs of RMBs when construct LLC ADD LINK CONT messages. 
BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 5 PID: 48 Comm: kworker/5:0 Kdump: loaded Tainted: G W E 6.4.0-rc3+ #49 Workqueue: events smc_llc_add_link_work [smc] RIP: 0010:smc_llc_add_link_cont+0x160/0x270 [smc] RSP: 0018:ffffa737801d3d50 EFLAGS: 00010286 RAX: ffff964f82144000 RBX: ffffa737801d3dd8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff964f81370c30 RBP: ffffa737801d3dd4 R08: ffff964f81370000 R09: ffffa737801d3db0 R10: 0000000000000001 R11: 0000000000000060 R12: ffff964f82e70000 R13: ffff964f81370c38 R14: ffffa737801d3dd3 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff9652bfd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 000000008fa20004 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: smc_llc_srv_rkey_exchange+0xa7/0x190 [smc] smc_llc_srv_add_link+0x3ae/0x5a0 [smc] smc_llc_add_link_work+0xb8/0x140 [smc] process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 When an alternate RNIC is available in system, SMC will try to add a new link based on the RNIC for resilience. All the RMBs in use will be mapped to the new link. Then the RMBs' MRs corresponding to the new link will be filled into LLC messages. For SMCRv1, they are ADD LINK CONT messages. However smc_llc_add_link_cont() may mistakenly access unused RMBs which haven't been mapped to the new link and have no valid MRs, thus causing a crash. So this patch fixes it.
Fixes: 87f88cda2128 ("net/smc: rkey processing for a new link as SMC client") Link: https://lore.kernel.org/r/1685101741-74826-3-git-send-email-guwen@linux.alibaba.com Signed-off-by: Wen Gu Reviewed-by: Wenjia Zhang Reviewed-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Wen Gu --- net/smc/smc_llc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 57c7ec70b823..ef46137d6379 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -849,6 +849,8 @@ static int smc_llc_add_link_cont(struct smc_link *link, addc_llc->num_rkeys = *num_rkeys_todo; n = *num_rkeys_todo; for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) { + while (*buf_pos && !(*buf_pos)->used) + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); if (!*buf_pos) { addc_llc->num_rkeys = addc_llc->num_rkeys - *num_rkeys_todo; @@ -865,8 +867,6 @@ static int smc_llc_add_link_cont(struct smc_link *link, (*num_rkeys_todo)--; *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); - while (*buf_pos && !(*buf_pos)->used) - *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); } addc_llc->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT; addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); -- Gitee