From 8fdd069567a373eb0fcb47cb51840a7ad6b06e3a Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Thu, 1 Jun 2023 20:34:06 +0800 Subject: [PATCH 1/4] RDMA/hns: Fix a missing check of atomic wr length driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9MANQ --------------------------------------------------------------- The only supported length of atomic wr in kernel space is 8. The driver needs to check the length before post send. Fixes: 00a59d30f3f9 ("RDMA/hns: Optimize wqe buffer filling process for post send") Signed-off-by: Junxian Huang --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 6c5630c56eb7..f709886a3914 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -94,6 +94,8 @@ /* Configure to HW for PAGE_SIZE larger than 4KB */ #define PG_SHIFT_OFFSET (PAGE_SHIFT - 12) +#define ATOMIC_WR_LEN 8 + #define HNS_ROCE_IDX_QUE_ENTRY_SZ 4 #define SRQ_DB_REG 0x230 diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index faa70c81660e..96ab26cd1833 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -166,15 +166,22 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, hr_reg_clear(fseg, FRMR_BLK_MODE); } -static void set_atomic_seg(const struct ib_send_wr *wr, - struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, - unsigned int valid_num_sge) +static int set_atomic_seg(struct hns_roce_dev *hr_dev, + const struct ib_send_wr *wr, + struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, + unsigned int valid_num_sge, u32 msg_len) { struct hns_roce_v2_wqe_data_seg *dseg = (void *)rc_sq_wqe + sizeof(struct hns_roce_v2_rc_send_wqe); struct hns_roce_wqe_atomic_seg *aseg = (void *)dseg + sizeof(struct hns_roce_v2_wqe_data_seg); + if (msg_len != ATOMIC_WR_LEN) { + ibdev_err(&hr_dev->ib_dev, "invalid atomic wr len, len = %u.\n", + msg_len); + return -EINVAL; + } + set_data_seg_v2(dseg, wr->sg_list); if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { @@ -187,6 +194,8 @@ static void set_atomic_seg(const struct ib_send_wr *wr, } hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); + + return 0; } static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, @@ -685,7 +694,8 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - set_atomic_seg(wr, rc_sq_wqe, valid_num_sge); + ret = set_atomic_seg(hr_dev, wr, rc_sq_wqe, valid_num_sge, + msg_len); else if (wr->opcode != IB_WR_REG_MR) ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe, &curr_idx, valid_num_sge); -- Gitee From e96a09a3ded1f8063e078c0b1e0bcf92dcc6ad6a Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Fri, 22 Dec 2023 17:05:24 +0800 Subject: [PATCH 2/4] RDMA/hns: Fix Use-After-Free of rsv_qp driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9MANQ ---------------------------------------------------------------------- For the HIP08, the reserved loopback QP is used to release MRs before the MPT is destroyed. After free_mr_exit() and before hns_roce_unregister_device(), rsv_qp is released and set to NULL, and ib_device is not unregister. During this period, the user mode can use ib_device to execute dereg_mr(). As a result, rsv_qp is accessed again and a NULL pointer is reported. To fix Use-After-Free of rsv_qp, execute free_mr_exit() after hns_roce_unregister_device(). Fixes: 6f5f556d3795 ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT") Signed-off-by: wenglianfa Signed-off-by: Juan Zhou --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 96ab26cd1833..04115e82ede5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3151,6 +3151,9 @@ static int hns_roce_v2_init(struct hns_roce_dev *hr_dev) static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) { + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_exit(hr_dev); + hns_roce_function_clear(hr_dev); if (!hr_dev->is_vf) @@ -7358,9 +7361,6 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) - free_mr_exit(hr_dev); - hns_roce_exit(hr_dev, bond_cleanup); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); -- Gitee From a91843046b2a71e6d9227d53b75f58d01733c8dc Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Wed, 24 Apr 2024 16:04:41 +0800 Subject: [PATCH 3/4] RDMA/hns: Fix sleeping in spin_lock critical section driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9MANQ ---------------------------------------------------------------------- Sleep is not allowed in the spinlock critical section, but ib_umem_release() may sleep in the spinlock critical sectio. To fix it, use mutex_lock() instead of spin_lock(). Fixes: 04c5d76e4f15 ("RDMA/hns: Fix simultaneous reset and resource deregistration") Signed-off-by: wenglianfa --- drivers/infiniband/hw/hns/hns_roce_device.h | 4 ++-- drivers/infiniband/hw/hns/hns_roce_main.c | 8 ++++++-- drivers/infiniband/hw/hns/hns_roce_mr.c | 16 ++++++++-------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index f709886a3914..29d5eedecfaf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1152,9 +1152,9 @@ struct hns_roce_dev { struct notifier_block bond_nb; struct list_head mtr_unfree_list; /* list of unfree mtr on this dev */ - spinlock_t mtr_unfree_list_lock; /* protect mtr_unfree_list */ + struct mutex mtr_unfree_list_mutex; /* protect mtr_unfree_list */ struct list_head umem_unfree_list; /* list of unfree umem on this dev */ - spinlock_t umem_unfree_list_lock; /* protect umem_unfree_list */ + struct mutex umem_unfree_list_mutex; /* protect umem_unfree_list */ }; static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 86454de85d61..424dcf69d3b8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -1246,6 +1246,8 @@ static void hns_roce_teardown_hca(struct hns_roce_dev *hr_dev) hns_roce_cleanup_dca(hr_dev); hns_roce_cleanup_bitmap(hr_dev); + mutex_destroy(&hr_dev->umem_unfree_list_mutex); + mutex_destroy(&hr_dev->mtr_unfree_list_mutex); mutex_destroy(&hr_dev->uctx_list_mutex); if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) @@ -1273,10 +1275,10 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) mutex_init(&hr_dev->uctx_list_mutex); INIT_LIST_HEAD(&hr_dev->mtr_unfree_list); - spin_lock_init(&hr_dev->mtr_unfree_list_lock); + mutex_init(&hr_dev->mtr_unfree_list_mutex); INIT_LIST_HEAD(&hr_dev->umem_unfree_list); - spin_lock_init(&hr_dev->umem_unfree_list_lock); + mutex_init(&hr_dev->umem_unfree_list_mutex); if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) { @@ -1320,6 +1322,8 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) mutex_destroy(&hr_dev->pgdir_mutex); + mutex_destroy(&hr_dev->umem_unfree_list_mutex); + mutex_destroy(&hr_dev->mtr_unfree_list_mutex); mutex_destroy(&hr_dev->uctx_list_mutex); return ret; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 59e7ed66d53c..0a6fde734def 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1250,22 +1250,22 @@ void hns_roce_add_unfree_mtr(struct hns_roce_mtr_node *pos, { hns_roce_copy_mtr(&pos->mtr, mtr); - spin_lock(&hr_dev->mtr_unfree_list_lock); + mutex_lock(&hr_dev->mtr_unfree_list_mutex); list_add_tail(&pos->list, &hr_dev->mtr_unfree_list); - spin_unlock(&hr_dev->mtr_unfree_list_lock); + mutex_unlock(&hr_dev->mtr_unfree_list_mutex); } void hns_roce_free_unfree_mtr(struct hns_roce_dev *hr_dev) { struct hns_roce_mtr_node *pos, *next; - spin_lock(&hr_dev->mtr_unfree_list_lock); + mutex_lock(&hr_dev->mtr_unfree_list_mutex); list_for_each_entry_safe(pos, next, &hr_dev->mtr_unfree_list, list) { list_del(&pos->list); hns_roce_mtr_destroy(hr_dev, &pos->mtr); kvfree(pos); } - spin_unlock(&hr_dev->mtr_unfree_list_lock); + mutex_unlock(&hr_dev->mtr_unfree_list_mutex); } void hns_roce_add_unfree_umem(struct hns_roce_user_db_page *user_page, @@ -1275,20 +1275,20 @@ void hns_roce_add_unfree_umem(struct hns_roce_user_db_page *user_page, pos->umem = user_page->umem; - spin_lock(&hr_dev->umem_unfree_list_lock); + mutex_lock(&hr_dev->umem_unfree_list_mutex); list_add_tail(&pos->list, &hr_dev->umem_unfree_list); - spin_unlock(&hr_dev->umem_unfree_list_lock); + mutex_unlock(&hr_dev->umem_unfree_list_mutex); } void hns_roce_free_unfree_umem(struct hns_roce_dev *hr_dev) { struct hns_roce_umem_node *pos, *next; - spin_lock(&hr_dev->umem_unfree_list_lock); + mutex_lock(&hr_dev->umem_unfree_list_mutex); list_for_each_entry_safe(pos, next, &hr_dev->umem_unfree_list, list) { list_del(&pos->list); ib_umem_release(pos->umem); kvfree(pos); } - spin_unlock(&hr_dev->umem_unfree_list_lock); + mutex_unlock(&hr_dev->umem_unfree_list_mutex); } -- Gitee From 8cde9df37098cdda45c259ea2bcb1a569a99bc97 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Sat, 20 Apr 2024 15:34:25 +0800 Subject: [PATCH 4/4] RDMA/hns: Fix soft lockup under heavy CEQE load driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9MANQ ---------------------------------------------------------------------- CEQEs are handled in interrupt handler currently. This may cause the CPU core staying in interrupt context too long and lead to soft lockup under heavy load. Handle CEQEs in tasklet and set an upper limit for the number of CEQE handled by a single call of tasklet. Fixes: a5073d6054f7 ("RDMA/hns: Add eq support of hip08") Signed-off-by: Junxian Huang --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 80 ++++++++++++--------- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 29d5eedecfaf..399dc3fd5791 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -802,6 +802,7 @@ struct hns_roce_eq { int shift; int event_type; int sub_type; + struct tasklet_struct tasklet; }; struct hns_roce_eq_table { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 04115e82ede5..f14aec726858 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6472,33 +6472,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? ceqe : NULL; } -static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_eq *eq) { - struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - irqreturn_t ceqe_found = IRQ_NONE; - u32 cqn; - - while (ceqe) { - /* Make sure we read CEQ entry after we have checked the - * ownership bit - */ - dma_rmb(); - - cqn = hr_reg_read(ceqe, CEQE_CQN); + tasklet_schedule(&eq->tasklet); - hns_roce_cq_completion(hr_dev, cqn); - - ++eq->cons_index; - ceqe_found = IRQ_HANDLED; - atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); - - ceqe = next_ceqe_sw_v2(eq); - } - - update_eq_db(eq); - - return IRQ_RETVAL(ceqe_found); + return IRQ_HANDLED; } static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) @@ -6509,7 +6487,7 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ - int_work = hns_roce_v2_ceq_int(hr_dev, eq); + int_work = hns_roce_v2_ceq_int(eq); else /* Asynchronous event interrupt */ int_work = hns_roce_v2_aeq_int(hr_dev, eq); @@ -6877,6 +6855,34 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, return ret; } +static void hns_roce_ceq_task(struct tasklet_struct *task) +{ + struct hns_roce_eq *eq = from_tasklet(eq, task, tasklet); + struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); + struct hns_roce_dev *hr_dev = eq->hr_dev; + int ceqe_num = 0; + u32 cqn; + + while (ceqe && ceqe_num < hr_dev->caps.ceqe_depth) { + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = hr_reg_read(ceqe, CEQE_CQN); + + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ++ceqe_num; + atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); + + ceqe = next_ceqe_sw_v2(eq); + } + + update_eq_db(eq); +} + static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, int comp_num, int aeq_num, int other_num) { @@ -6908,21 +6914,24 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, j - other_num - aeq_num); for (j = 0; j < irq_num; j++) { - if (j < other_num) + if (j < other_num) { ret = request_irq(hr_dev->irq[j], hns_roce_v2_msix_interrupt_abn, 0, hr_dev->irq_names[j], hr_dev); - - else if (j < (other_num + comp_num)) + } else if (j < (other_num + comp_num)) { + tasklet_setup(&eq_table->eq[j - other_num].tasklet, + hns_roce_ceq_task); ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j + aeq_num], &eq_table->eq[j - other_num]); - else + } else { ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j - comp_num], &eq_table->eq[j - other_num]); + } + if (ret) { dev_err(hr_dev->dev, "request irq error!\n"); goto err_request_failed; @@ -6933,11 +6942,13 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, err_request_failed: for (j -= 1; j >= 0; j--) - if (j < other_num) + if (j < other_num) { free_irq(hr_dev->irq[j], hr_dev); - else + } else { free_irq(eq_table->eq[j - other_num].irq, &eq_table->eq[j - other_num]); + tasklet_kill(&eq_table->eq[j - other_num].tasklet); + } err_kzalloc_failed: for (i -= 1; i >= 0; i--) @@ -6958,8 +6969,11 @@ static void __hns_roce_free_irq(struct hns_roce_dev *hr_dev) for (i = 0; i < hr_dev->caps.num_other_vectors; i++) free_irq(hr_dev->irq[i], hr_dev); - for (i = 0; i < eq_num; i++) + for (i = 0; i < eq_num; i++) { free_irq(hr_dev->eq_table.eq[i].irq, &hr_dev->eq_table.eq[i]); + if (i < hr_dev->caps.num_comp_vectors) + tasklet_kill(&hr_dev->eq_table.eq[i].tasklet); + } for (i = 0; i < irq_num; i++) kfree(hr_dev->irq_names[i]); -- Gitee