From a2fe6fdfc37c3b39b00e03fa8436b2a93db1280f Mon Sep 17 00:00:00 2001 From: tianx Date: Wed, 21 Aug 2024 15:19:45 +0800 Subject: [PATCH 1/2] drivers: Add GDR(GPU Direct RDMA) support yunsilicon inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IALL3Y CVE: NA ------------------------------------------ Add GPU Direct RDMA support Reviewed-by: Wei Honggang Reviewed-by: Wang Saochuang Signed-off-by: Tian Xin --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + drivers/infiniband/Kconfig | 10 + drivers/infiniband/core/Makefile | 1 + drivers/infiniband/core/ib_peer_mem.h | 65 +++ drivers/infiniband/core/peer_mem.c | 525 +++++++++++++++++++++++++ drivers/infiniband/core/umem.c | 53 ++- drivers/infiniband/hw/xsc/ib_umem_ex.c | 8 +- drivers/infiniband/hw/xsc/ib_umem_ex.h | 4 +- drivers/infiniband/hw/xsc/mr.c | 21 +- include/rdma/ib_umem.h | 39 ++ include/rdma/peer_mem.h | 176 +++++++++ 12 files changed, 888 insertions(+), 16 deletions(-) create mode 100644 drivers/infiniband/core/ib_peer_mem.h create mode 100644 drivers/infiniband/core/peer_mem.c create mode 100644 include/rdma/peer_mem.h diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 650fe88cbb04..45d9443367e1 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6186,6 +6186,7 @@ CONFIG_INFINIBAND_ON_DEMAND_PAGING=y CONFIG_INFINIBAND_ADDR_TRANS=y CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y CONFIG_INFINIBAND_VIRT_DMA=y +CONFIG_INFINIBAND_PEER_MEMORY=y CONFIG_INFINIBAND_BNXT_RE=m CONFIG_INFINIBAND_CXGB4=m # CONFIG_INFINIBAND_EFA is not set diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index c7a1a37fb32e..577a8cf15930 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -6855,6 +6855,7 @@ CONFIG_INFINIBAND_ON_DEMAND_PAGING=y CONFIG_INFINIBAND_ADDR_TRANS=y 
CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y CONFIG_INFINIBAND_VIRT_DMA=y +CONFIG_INFINIBAND_PEER_MEMORY=y CONFIG_INFINIBAND_BNXT_RE=m CONFIG_INFINIBAND_CXGB4=m # CONFIG_INFINIBAND_EFA is not set diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index c80ccc2bb69b..d62ab3edc8c3 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -74,6 +74,16 @@ config INFINIBAND_ADDR_TRANS_CONFIGFS This allows the user to config the default GID type that the CM uses for each device, when initiaing new connections. +config INFINIBAND_PEER_MEMORY + bool "InfiniBand peer memory support" + depends on INFINIBAND_USER_MEM + default n + help + Peer memory support for the InfiniBand subsystem. This + enables GPU drivers to provide peer memory operations + and allows InfiniBand hardware drivers to utilize GPU + peer memory. + config INFINIBAND_VIRT_DMA def_bool !HIGHMEM diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 8ab4eea5a0a5..ae0c1349173b 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -41,4 +41,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_wq.o \ uverbs_std_types_qp.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o +ib_uverbs-$(CONFIG_INFINIBAND_PEER_MEMORY) += peer_mem.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/ib_peer_mem.h b/drivers/infiniband/core/ib_peer_mem.h new file mode 100644 index 000000000000..988012340ce6 --- /dev/null +++ b/drivers/infiniband/core/ib_peer_mem.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved. + * Copyright (C) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All Rights Reserved. 
+ */
+#ifndef RDMA_IB_PEER_MEM_H
+#define RDMA_IB_PEER_MEM_H
+
+#include
+#include
+#include
+#include
+
+struct ib_peer_memory_statistics {
+	atomic64_t num_alloc_mrs;	/* umems mapped by ib_peer_umem_get() */
+	atomic64_t num_dealloc_mrs;	/* umems that left the MAPPED state */
+	atomic64_t num_reg_pages;	/* sgt nents added at registration */
+	atomic64_t num_dereg_pages;	/* sgt nents added on unmap */
+	atomic64_t num_reg_bytes;	/* umem->length added at registration */
+	atomic64_t num_dereg_bytes;	/* umem->length added on unmap */
+	unsigned long num_free_callbacks; /* invalidate calls; under umem_xa lock */
+};
+
+struct ib_peer_memory_client {
+	refcount_t usecnt;		/* 1 for registration + 1 per acquired umem */
+	struct completion usecnt_zero;	/* completed when usecnt reaches zero */
+	const struct peer_memory_client *peer_mem; /* ops passed at registration */
+	struct list_head core_peer_list; /* entry in peer_memory_list */
+	struct ib_peer_memory_statistics stats;
+	struct xarray umem_xa;		/* xa_id -> struct ib_umem_peer */
+	u32 xa_cyclic_next;		/* cursor for xa_alloc_cyclic() */
+	bool invalidation_required;	/* client asked for the invalidate callback */
+};
+
+enum ib_umem_mapped_state {
+	UMEM_PEER_UNMAPPED,	/* zero value: state of a freshly kzalloc'd umem */
+	UMEM_PEER_MAPPED,	/* get_pages() and dma_map() both succeeded */
+	UMEM_PEER_INVALIDATED,	/* invalidated; the peer client unmaps itself */
+};
+
+struct ib_umem_peer {
+	struct ib_umem umem;	/* embedded umem returned to the caller */
+	struct kref kref;	/* also taken by in-flight invalidations */
+	/* peer memory that manages this umem */
+	struct ib_peer_memory_client *ib_peer_client;
+	void *peer_client_context;	/* opaque value filled in by acquire() */
+	umem_invalidate_func_t invalidation_func; /* set by activate, cleared by stop */
+	void *invalidation_private;	/* priv argument for invalidation_func */
+	struct mutex mapping_lock;	/* taken by the get, invalidate and stop paths */
+	enum ib_umem_mapped_state mapped_state;
+	u32 xa_id;		/* index in umem_xa, or PEER_NO_INVALIDATION_ID */
+	/* first/last SGL values saved by fix_peer_sgls(), restored on unmap */
+	struct scatterlist *first_sg;
+	dma_addr_t first_dma_address;
+	unsigned int first_dma_length;
+	unsigned int first_length;
+	struct scatterlist *last_sg;
+	unsigned int last_dma_length;
+	unsigned int last_length;
+};
+
+struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
+				 unsigned long peer_mem_flags);
+void ib_peer_umem_release(struct ib_umem *umem);
+
+#endif
diff --git a/drivers/infiniband/core/peer_mem.c b/drivers/infiniband/core/peer_mem.c
new file mode 100644
index 000000000000..3c241054a138
--- /dev/null
+++ b/drivers/infiniband/core/peer_mem.c
@@ -0,0 +1,525 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved.
+ * Copyright (C) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All Rights Reserved.
+ */
+
+#include
+#include
+#include
+#include "ib_peer_mem.h"
+
+static DEFINE_MUTEX(peer_memory_mutex);
+static LIST_HEAD(peer_memory_list);
+#define PEER_NO_INVALIDATION_ID U32_MAX
+
+static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context);
+
+/*
+ * Check that the client filled in every operation listed in
+ * mandatory_table. Returns 0 if all are present, -EINVAL (after logging the
+ * first missing one) otherwise.
+ */
+static int ib_memory_peer_check_mandatory(const struct peer_memory_client
+					   *peer_client)
+{
+#define PEER_MEM_MANDATORY_FUNC(x) {offsetof(struct peer_memory_client, x), #x}
+	int i;
+	static const struct {
+		size_t offset;
+		const char *name;
+	} mandatory_table[] = {
+		PEER_MEM_MANDATORY_FUNC(acquire),
+		PEER_MEM_MANDATORY_FUNC(get_pages),
+		PEER_MEM_MANDATORY_FUNC(put_pages),
+		PEER_MEM_MANDATORY_FUNC(dma_map),
+		PEER_MEM_MANDATORY_FUNC(dma_unmap),
+	};
+
+	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+		/* Read the function pointer stored at the recorded offset. */
+		const void *slot = (const char *)peer_client +
+				   mandatory_table[i].offset;
+
+		if (!*(void * const *)slot) {
+			pr_err("Peer memory %s is missing mandatory function %s\n",
+			       peer_client->name, mandatory_table[i].name);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Register a peer memory client with the umem core.
+ *
+ * @peer_client: operations table supplied by the peer driver. The pointer is
+ *               stored, so it must stay valid until unregistration.
+ * @invalidate_callback: if non-NULL, receives the function the peer must call
+ *               to invalidate a umem.
+ *
+ * Returns an opaque registration handle, or NULL if peer_client is NULL,
+ * lacks a mandatory operation, or memory allocation fails.
+ */
+void *
+ib_register_peer_memory_client(const struct peer_memory_client *peer_client,
+			       invalidate_peer_memory *invalidate_callback)
+{
+	struct ib_peer_memory_client *ib_peer_client;
+
+	if (!peer_client || ib_memory_peer_check_mandatory(peer_client))
+		return NULL;
+
+	ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL);
+	if (!ib_peer_client)
+		return NULL;
+	/* The registration itself holds the initial reference. */
+	refcount_set(&ib_peer_client->usecnt, 1);
+	init_completion(&ib_peer_client->usecnt_zero);
+	ib_peer_client->peer_mem = peer_client;
+	xa_init_flags(&ib_peer_client->umem_xa, XA_FLAGS_ALLOC);
+
+	/*
+	 * If the peer wants the invalidation_callback then all memory users
+	 * linked to that peer must support invalidation.
+	 */
+	if (invalidate_callback) {
+		*invalidate_callback = ib_invalidate_peer_memory;
+		ib_peer_client->invalidation_required = true;
+	}
+
+	mutex_lock(&peer_memory_mutex);
+	list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list);
+	mutex_unlock(&peer_memory_mutex);
+	return ib_peer_client;
+}
+EXPORT_SYMBOL(ib_register_peer_memory_client);
+
+/*
+ * Undo ib_register_peer_memory_client().
+ *
+ * @reg_handle: handle returned by registration. NULL (a failed registration)
+ *              is accepted and ignored.
+ *
+ * Blocks until every umem that acquired this client has been released, then
+ * frees the registration. reg_handle must not be used after this returns.
+ */
+void ib_unregister_peer_memory_client(void *reg_handle)
+{
+	struct ib_peer_memory_client *ib_peer_client = reg_handle;
+
+	if (!ib_peer_client)
+		return;
+
+	mutex_lock(&peer_memory_mutex);
+	list_del(&ib_peer_client->core_peer_list);
+	mutex_unlock(&peer_memory_mutex);
+
+	/*
+	 * Wait for all umems to be destroyed before returning. Once
+	 * ib_unregister_peer_memory_client() returns no umems will call any
+	 * peer_mem ops.
+	 */
+	if (refcount_dec_and_test(&ib_peer_client->usecnt))
+		complete(&ib_peer_client->usecnt_zero);
+	wait_for_completion(&ib_peer_client->usecnt_zero);
+
+	/*
+	 * The client is off the list and its last reference is gone, so
+	 * nothing can reach it any more. Release the memory allocated in
+	 * ib_register_peer_memory_client(), which was previously leaked.
+	 */
+	xa_destroy(&ib_peer_client->umem_xa);
+	kfree(ib_peer_client);
+}
+EXPORT_SYMBOL(ib_unregister_peer_memory_client);
+
+/*
+ * Find the first registered client whose acquire() claims [addr, addr + size).
+ *
+ * Clients that require invalidation are skipped unless the caller passed
+ * IB_PEER_MEM_INVAL_SUPP. On success a usecnt reference is taken, the value
+ * written by acquire() is left in *peer_client_context, and the client is
+ * returned; otherwise NULL is returned.
+ */
+static struct ib_peer_memory_client *
+ib_get_peer_client(unsigned long addr, size_t size,
+		   unsigned long peer_mem_flags, void **peer_client_context)
+{
+	struct ib_peer_memory_client *ib_peer_client;
+	int ret = 0;
+
+	mutex_lock(&peer_memory_mutex);
+	list_for_each_entry(ib_peer_client, &peer_memory_list,
+			    core_peer_list) {
+		if (ib_peer_client->invalidation_required &&
+		    (!(peer_mem_flags & IB_PEER_MEM_INVAL_SUPP)))
+			continue;
+		ret = ib_peer_client->peer_mem->acquire(addr, size, NULL, NULL,
+							peer_client_context);
+		/* Only a positive return means the client owns the range. */
+		if (ret > 0) {
+			refcount_inc(&ib_peer_client->usecnt);
+			mutex_unlock(&peer_memory_mutex);
+			return ib_peer_client;
+		}
+	}
+	mutex_unlock(&peer_memory_mutex);
+	return NULL;
+}
+
+/*
+ * Undo ib_get_peer_client(): call the client's optional release() op and drop
+ * the usecnt reference, completing usecnt_zero when it was the last one.
+ */
+static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
+			       void *peer_client_context)
+{
+	if (ib_peer_client->peer_mem->release)
+		ib_peer_client->peer_mem->release(peer_client_context);
+	if (refcount_dec_and_test(&ib_peer_client->usecnt))
+		complete(&ib_peer_client->usecnt_zero);
+}
+
+static void
ib_peer_umem_kref_release(struct kref *kref)
+{
+	struct ib_umem_peer *umem_p =
+		container_of(kref, struct ib_umem_peer, kref);
+	/* kref release callback: destroy the mapping lock, then free the umem. */
+	mutex_destroy(&umem_p->mapping_lock);
+	kfree(umem_p);
+}
+/*
+ * Move umem_p from cur_state to to_state and record to_state in mapped_state.
+ *
+ * Leaving UMEM_PEER_MAPPED restores the first/last SGL values saved by
+ * fix_peer_sgls() and zeroes umem->sgt_append. The client's dma_unmap() and
+ * put_pages() are called only when the target is UMEM_PEER_UNMAPPED; for
+ * UMEM_PEER_INVALIDATED they are not called here.
+ */
+static void ib_unmap_peer_client(struct ib_umem_peer *umem_p,
+				 enum ib_umem_mapped_state cur_state,
+				 enum ib_umem_mapped_state to_state)
+{
+	struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client;
+	const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem;
+	struct ib_umem *umem = &umem_p->umem;
+
+	if (cur_state == UMEM_PEER_MAPPED &&
+	    (to_state == UMEM_PEER_UNMAPPED ||
+	     to_state == UMEM_PEER_INVALIDATED)) {
+		/*
+		 * In the invalidated state we will never touch the sg again,
+		 * but the client might, so fix it anyhow.
+		 */
+		if (umem_p->last_sg) {
+			umem_p->last_sg->length = umem_p->last_length;
+			sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length;
+		}
+
+		if (umem_p->first_sg) {
+			umem_p->first_sg->dma_address =
+				umem_p->first_dma_address;
+			umem_p->first_sg->length = umem_p->first_length;
+			sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length;
+		}
+
+		if (to_state == UMEM_PEER_UNMAPPED) {
+			peer_mem->dma_unmap(&umem_p->umem.sgt_append.sgt,
+					    umem_p->peer_client_context,
+					    umem_p->umem.ibdev->dma_device);
+			peer_mem->put_pages(&umem_p->umem.sgt_append.sgt,
+					    umem_p->peer_client_context);
+		}
+
+		memset(&umem->sgt_append, 0, sizeof(umem->sgt_append));
+		atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
+	}
+	/*
+	 * NOTE(review): sgt_append has already been zeroed by the memset above
+	 * (or on the earlier transition to INVALIDATED), so sgt.nents looks
+	 * like it is always 0 when added to num_dereg_pages - confirm.
+	 */
+	if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) ||
+	    (cur_state == UMEM_PEER_INVALIDATED &&
+	     to_state == UMEM_PEER_UNMAPPED)) {
+		atomic64_add(umem->sgt_append.sgt.nents,
+			     &ib_peer_client->stats.num_dereg_pages);
+		atomic64_add(umem->length,
+			     &ib_peer_client->stats.num_dereg_bytes);
+	}
+	umem_p->mapped_state = to_state;
+}
+
+/*
+ * True if the client should do unmap itself after the invalidate callback
+ * returns. Clients operating in this mode need to use this locking pattern:
+ *
+ * client_invalidate:
+ *    mutex_lock(&client_lock)
+ *     invalidate_callback():
+ *       mutex_lock(mapping_lock)
+ *       mutex_unlock(mapping_lock)
+ *     client_dma_unmap()
+ *     client_put_pages()
+ *    mutex_unlock(&client_lock)
+ *
+ * ib_umem_stop_invalidation_notifier():
+ *  mutex_lock(mapping_lock)
+ *  mutex_unlock(mapping_lock)
+ *  peer_mem->dma_unmap():
+ *    mutex_lock(&client_lock)
+ *     client_dma_unmap()
+ *    mutex_unlock(&client_lock)
+ *  peer_mem->put_pages():
+ *    mutex_lock(&client_lock)
+ *     client_put_pages()
+ *    mutex_unlock(&client_lock)
+ *
+ * ib_peer_umem_release():
+ *  peer_mem->release():
+ *    mutex_lock(&client_lock)
+ *    mutex_unlock(&client_lock)
+ *
+ * Noting that dma_unmap/put_pages can be called even though invalidate has
+ * already done the unmap, and release() can be called concurrently with
+ * invalidate. The client must protect itself against these races.
+ */
+static bool ib_peer_unmap_on_invalidate(struct ib_umem_peer *umem_p)
+{
+	const struct peer_memory_client *peer_mem =
+		umem_p->ib_peer_client->peer_mem;
+	const struct peer_memory_client_ex *peer_mem_ex;
+	/* A zero last version byte means this is not an extended (_ex) client. */
+	if (peer_mem->version[IB_PEER_MEMORY_VER_MAX - 1] == 0)
+		return false;
+	peer_mem_ex = container_of(peer_mem, const struct peer_memory_client_ex,
+				   client);
+	if (peer_mem_ex->ex_size <
+	    offsetofend(struct peer_memory_client_ex, flags))
+		return false;
+	return peer_mem_ex->flags & PEER_MEM_INVALIDATE_UNMAPS;
+}
+/*
+ * Invalidate callback handed to peer clients at registration.
+ *
+ * @reg_handle: the struct ib_peer_memory_client returned by registration.
+ * @core_context: the xa_id that was passed to the client's get_pages().
+ *
+ * Looks the umem up in umem_xa; if it is still there, calls the user's
+ * invalidation_func (only while MAPPED) and moves the umem to INVALIDATED or
+ * UNMAPPED depending on ib_peer_unmap_on_invalidate(). Always returns 0.
+ */
+static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
+{
+	struct ib_peer_memory_client *ib_peer_client = reg_handle;
+	struct ib_umem_peer *umem_p;
+
+	/*
+	 * The client is not required to fence against invalidation during
+	 * put_pages() as that would deadlock when we call put_pages() here.
+	 * Thus the core_context cannot be a umem pointer as we have no control
+	 * over the lifetime. Since we won't change the kABI for this to add a
+	 * proper kref, an xarray is used.
+	 */
+	xa_lock(&ib_peer_client->umem_xa);
+	ib_peer_client->stats.num_free_callbacks += 1;
+	umem_p = xa_load(&ib_peer_client->umem_xa, core_context);
+	if (!umem_p)
+		goto out_unlock;
+	kref_get(&umem_p->kref);
+	xa_unlock(&ib_peer_client->umem_xa);
+
+	mutex_lock(&umem_p->mapping_lock);
+	/*
+	 * For flows that require invalidation the invalidation_func should not
+	 * be NULL while the device can be doing DMA. The mapping_lock ensures
+	 * that the device is ready to receive an invalidation before one is
+	 * triggered here.
+	 */
+	if (umem_p->mapped_state == UMEM_PEER_MAPPED &&
+	    umem_p->invalidation_func)
+		umem_p->invalidation_func(&umem_p->umem,
+					  umem_p->invalidation_private);
+	if (ib_peer_unmap_on_invalidate(umem_p))
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_INVALIDATED);
+	else
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_UNMAPPED);
+	mutex_unlock(&umem_p->mapping_lock);
+	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+	return 0;
+
+out_unlock:
+	xa_unlock(&ib_peer_client->umem_xa);
+	return 0;
+}
+/*
+ * Install the invalidation callback for a peer umem and release the
+ * mapping_lock taken in ib_peer_umem_get().
+ *
+ * Warns and returns if umem is not a peer umem. Does nothing when the umem
+ * has no xarray id (xa_id == PEER_NO_INVALIDATION_ID).
+ */
+void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
+					    umem_invalidate_func_t func,
+					    void *priv)
+{
+	struct ib_umem_peer *umem_p =
+		container_of(umem, struct ib_umem_peer, umem);
+
+	if (WARN_ON(!umem->is_peer))
+		return;
+	if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
+		return;
+
+	umem_p->invalidation_func = func;
+	umem_p->invalidation_private = priv;
+	/* Pairs with the lock in ib_peer_umem_get() */
+	mutex_unlock(&umem_p->mapping_lock);
+
+	/* At this point func can be called asynchronously */
+}
+EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
+
+/*
+ * Caller has blocked DMA and will no longer be able to handle invalidate
+ * callbacks. Callers using invalidation must call this function before calling
+ * ib_peer_umem_release(). ib_umem_activate_invalidation_notifier() is optional
+ * before doing this.
+ */
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+	struct ib_umem_peer *umem_p =
+		container_of(umem, struct ib_umem_peer, umem);
+	bool unmap_on_invalidate = ib_peer_unmap_on_invalidate(umem_p);
+	enum ib_umem_mapped_state cur_state;
+
+	if (umem_p->invalidation_func) {
+		mutex_lock(&umem_p->mapping_lock);
+		umem_p->invalidation_func = NULL;
+	} else if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) {
+		mutex_lock(&umem_p->mapping_lock);
+	} else {
+		/*
+		 * Haven't called ib_umem_activate_invalidation_notifier() yet,
+		 * still have the lock
+		 *
+		 * NOTE(review): ib_peer_umem_get() drops mapping_lock exactly
+		 * when xa_id == PEER_NO_INVALIDATION_ID and keeps it held
+		 * otherwise, which looks like the opposite of what this branch
+		 * and the one above assume - confirm the lock state for each
+		 * branch before relying on this function.
+		 */
+	}
+
+	if (!unmap_on_invalidate) {
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_UNMAPPED);
+	} else {
+		/* Block ib_invalidate_peer_memory() */
+		cur_state = umem_p->mapped_state;
+		umem_p->mapped_state = UMEM_PEER_UNMAPPED;
+	}
+	mutex_unlock(&umem_p->mapping_lock);
+	/* In unmap-on-invalidate mode the peer ops run without mapping_lock. */
+	if (unmap_on_invalidate)
+		ib_unmap_peer_client(umem_p, cur_state, UMEM_PEER_UNMAPPED);
+}
+EXPORT_SYMBOL(ib_umem_stop_invalidation_notifier);
+/*
+ * Adjust the first and last SGL of a peer mapping whose page size differs
+ * from PAGE_SIZE, so the list covers the PAGE_SIZE-aligned umem range rather
+ * than the peer_page_size-aligned one. The original values are saved in
+ * umem_p (first_* / last_*) so ib_unmap_peer_client() can restore them.
+ */
+static void fix_peer_sgls(struct ib_umem_peer *umem_p,
+			  unsigned long peer_page_size)
+{
+	struct ib_umem *umem = &umem_p->umem;
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) {
+		if (i == 0) {
+			unsigned long offset;
+			/* Save the untouched values of the first entry. */
+			umem_p->first_sg = sg;
+			umem_p->first_dma_address = sg->dma_address;
+			umem_p->first_dma_length = sg_dma_len(sg);
+			umem_p->first_length = sg->length;
+			/* Bytes between the peer-page start and the PAGE_SIZE start. */
+			offset = ALIGN_DOWN(umem->address, PAGE_SIZE) -
+				 ALIGN_DOWN(umem->address, peer_page_size);
+			sg->dma_address += offset;
+			sg_dma_len(sg) -= offset;
+			sg->length -= offset;
+		}
+
+		if (i == umem->sgt_append.sgt.nents - 1) {
+			unsigned long trim;
+			/* Save the untouched values of the last entry. */
+			umem_p->last_sg = sg;
+			umem_p->last_dma_length = sg_dma_len(sg);
+			umem_p->last_length = sg->length;
+			/* Bytes between the PAGE_SIZE end and the peer-page end. */
+			trim = ALIGN(umem->address + umem->length,
+				     peer_page_size) -
+			       ALIGN(umem->address + umem->length, PAGE_SIZE);
+			sg_dma_len(sg) -= trim;
+			sg->length -= trim;
+		}
+	}
+}
+
+struct ib_umem
*ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
+				  unsigned long peer_mem_flags)
+{
+	struct ib_peer_memory_client *ib_peer_client;
+	unsigned long peer_page_size;
+	void *peer_client_context;
+	struct ib_umem_peer *umem_p;
+	int ret;
+
+	/*
+	 * Ask the registered peer clients whether one of them owns the
+	 * address range described by old_umem.
+	 *
+	 * Returns ERR_PTR(old_ret) when no client claims the range and
+	 * ERR_PTR(-errno) when setting up the peer mapping fails; old_umem is
+	 * left untouched in both cases. On success old_umem is freed and a
+	 * pointer to the umem embedded in the new ib_umem_peer is returned.
+	 */
+	ib_peer_client =
+		ib_get_peer_client(old_umem->address, old_umem->length,
+				   peer_mem_flags, &peer_client_context);
+	if (!ib_peer_client)
+		return ERR_PTR(old_ret);
+
+	umem_p = kzalloc(sizeof(*umem_p), GFP_KERNEL);
+	if (!umem_p) {
+		ret = -ENOMEM;
+		goto err_client;
+	}
+
+	kref_init(&umem_p->kref);
+	/* Start from a copy of the caller's umem, with an empty sg table. */
+	umem_p->umem = *old_umem;
+	memset(&umem_p->umem.sgt_append, 0, sizeof(umem_p->umem.sgt_append));
+	umem_p->umem.is_peer = 1;
+	umem_p->ib_peer_client = ib_peer_client;
+	umem_p->peer_client_context = peer_client_context;
+	mutex_init(&umem_p->mapping_lock);
+	umem_p->xa_id = PEER_NO_INVALIDATION_ID;
+
+	mutex_lock(&umem_p->mapping_lock);
+	if (ib_peer_client->invalidation_required) {
+		ret = xa_alloc_cyclic(&ib_peer_client->umem_xa, &umem_p->xa_id,
+				      umem_p,
+				      XA_LIMIT(0, PEER_NO_INVALIDATION_ID - 1),
+				      &ib_peer_client->xa_cyclic_next,
+				      GFP_KERNEL);
+		if (ret < 0)
+			goto err_umem;
+	}
+
+	/*
+	 * We always request write permissions to the pages, to force breaking
+	 * of any CoW during the registration of the MR. For read-only MRs we
+	 * use the "force" flag to indicate that CoW breaking is required but
+	 * the registration should not fail if referencing read-only areas.
+	 */
+	ret = ib_peer_client->peer_mem->get_pages(umem_p->umem.address,
+						  umem_p->umem.length, 1,
+						  !umem_p->umem.writable, NULL,
+						  peer_client_context,
+						  umem_p->xa_id);
+	if (ret)
+		goto err_xa;
+
+	ret = ib_peer_client->peer_mem->dma_map(&umem_p->umem.sgt_append.sgt,
+						peer_client_context,
+						umem_p->umem.ibdev->dma_device, 0,
+						&umem_p->umem.sgt_append.sgt.nents);
+	if (ret)
+		goto err_pages;
+
+	/*
+	 * get_page_size() is not one of the operations checked at
+	 * registration, so it may be NULL. Treat a client without it as
+	 * using PAGE_SIZE, which needs no SGL fix-up.
+	 */
+	if (ib_peer_client->peer_mem->get_page_size) {
+		peer_page_size = ib_peer_client->peer_mem->get_page_size(
+			peer_client_context);
+		if (peer_page_size != PAGE_SIZE)
+			fix_peer_sgls(umem_p, peer_page_size);
+	}
+
+	umem_p->mapped_state = UMEM_PEER_MAPPED;
+	atomic64_add(umem_p->umem.sgt_append.sgt.nents,
+		     &ib_peer_client->stats.num_reg_pages);
+	atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes);
+	atomic64_inc(&ib_peer_client->stats.num_alloc_mrs);
+
+	/*
+	 * If invalidation is allowed then the caller must call
+	 * ib_umem_activate_invalidation_notifier() or ib_peer_umem_release() to
+	 * unlock this mutex. This call should be done after the last read to
+	 * sg_head, once the caller is ready for the invalidation function to be
+	 * called.
+	 */
+	if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
+		mutex_unlock(&umem_p->mapping_lock);
+
+	/*
+	 * On success the old umem is replaced with the new, larger, allocation
+	 */
+	kfree(old_umem);
+	return &umem_p->umem;
+
+err_pages:
+	ib_peer_client->peer_mem->put_pages(&umem_p->umem.sgt_append.sgt,
+					    umem_p->peer_client_context);
+err_xa:
+	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
+		xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
+err_umem:
+	mutex_unlock(&umem_p->mapping_lock);
+	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+err_client:
+	ib_put_peer_client(ib_peer_client, peer_client_context);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Release a umem obtained from ib_peer_umem_get(): make sure it is unmapped,
+ * remove it from the client's xarray, drop the client reference, undo the
+ * pinned_vm accounting and drop the initial kref.
+ */
+void ib_peer_umem_release(struct ib_umem *umem)
+{
+	struct ib_umem_peer *umem_p =
+		container_of(umem, struct ib_umem_peer, umem);
+
+	/*
+	 * If ib_umem_activate_invalidation_notifier() is called then
+	 * ib_umem_stop_invalidation_notifier() must be called before release.
+	 */
+	WARN_ON(umem_p->invalidation_func);
+
+	/* For no invalidation cases, make sure it is unmapped */
+	ib_unmap_peer_client(umem_p, umem_p->mapped_state, UMEM_PEER_UNMAPPED);
+
+	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
+		xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
+	ib_put_peer_client(umem_p->ib_peer_client, umem_p->peer_client_context);
+	umem_p->ib_peer_client = NULL;
+
+	/* Must match ib_umem_release() */
+	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
+	mmdrop(umem->owning_mm);
+
+	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+}
+
+/* Use it like this:
+ * struct peer_memory_client_ex peer_memory_test = {
+ *     .client = {
+ *         .version = "1.0",
+ *         .version[IB_PEER_MEMORY_VER_MAX-1] = 1,
+ *     },
+ *     .ex_size = sizeof(struct peer_memory_client_ex),
+ *     .flags = PEER_MEM_INVALIDATE_UNMAPS,
+ * };
+ */
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 07c571c7b699..68b8177423db 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -44,6
+44,9 @@ #include #include "uverbs.h" +#ifdef CONFIG_INFINIBAND_PEER_MEMORY +#include "ib_peer_mem.h" +#endif static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { @@ -132,15 +135,17 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, EXPORT_SYMBOL(ib_umem_find_best_pgsz); /** - * ib_umem_get - Pin and DMA map userspace memory. + * __ib_umem_get - Pin and DMA map userspace memory. * * @device: IB device to connect UMEM * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned + * @peer_mem_flags: IB_PEER_MEM_xxx flags for memory being used */ -struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, - size_t size, int access) +static struct ib_umem *__ib_umem_get(struct ib_device *device, + unsigned long addr, size_t size, int access, + unsigned long peer_mem_flags) { struct ib_umem *umem; struct page **page_list; @@ -243,6 +248,27 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, umem_release: __ib_umem_release(device, umem, 0); + +#ifdef CONFIG_INFINIBAND_PEER_MEMORY + /* + * If the address belongs to peer memory client, then the first + * call to get_user_pages will fail. In this case, try to get + * these pages from the peers. + */ + if (ret < 0 && peer_mem_flags & IB_PEER_MEM_ALLOW) { + struct ib_umem *new_umem; + + new_umem = ib_peer_umem_get(umem, ret, peer_mem_flags); + if (IS_ERR(new_umem)) { + ret = PTR_ERR(new_umem); + goto vma; + } + umem = new_umem; + ret = 0; + goto out; + } +vma: +#endif atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: free_page((unsigned long) page_list); @@ -253,8 +279,25 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, } return ret ? 
ERR_PTR(ret) : umem; } + +struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, + size_t size, int access) +{ + return __ib_umem_get(device, addr, size, access, 0); +} EXPORT_SYMBOL(ib_umem_get); +#ifdef CONFIG_INFINIBAND_PEER_MEMORY +struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr, + size_t size, int access, + unsigned long peer_mem_flags) +{ + return __ib_umem_get(device, addr, size, access, + IB_PEER_MEM_ALLOW | peer_mem_flags); +} +EXPORT_SYMBOL(ib_umem_get_peer); +#endif + /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release @@ -268,6 +311,10 @@ void ib_umem_release(struct ib_umem *umem) if (umem->is_odp) return ib_umem_odp_release(to_ib_umem_odp(umem)); +#ifdef CONFIG_INFINIBAND_PEER_MEMORY + if (umem->is_peer) + return ib_peer_umem_release(umem); +#endif __ib_umem_release(umem->ibdev, umem, 1); atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); diff --git a/drivers/infiniband/hw/xsc/ib_umem_ex.c b/drivers/infiniband/hw/xsc/ib_umem_ex.c index 58ded090b523..8cd710b0502d 100644 --- a/drivers/infiniband/hw/xsc/ib_umem_ex.c +++ b/drivers/infiniband/hw/xsc/ib_umem_ex.c @@ -6,7 +6,7 @@ #include -#ifndef MLX_PEER_SUPPORT +#ifndef CONFIG_INFINIBAND_PEER_MEMORY #include "ib_peer_mem.h" #endif @@ -90,15 +90,15 @@ struct ib_umem_ex *ib_umem_ex(struct ib_umem *umem) if (!umem) return ERR_PTR(-EINVAL); -#ifndef MLX_PEER_SUPPORT +#ifdef CONFIG_INFINIBAND_PEER_MEMORY + ret_umem = (struct ib_umem_ex *)umem; +#else ret_umem = kzalloc(sizeof(*ret_umem), GFP_KERNEL); if (!ret_umem) return ERR_PTR(-ENOMEM); ret_umem->umem = *umem; kfree(umem); -#else - ret_umem = (struct ib_umem_ex *)umem; #endif return ret_umem; } diff --git a/drivers/infiniband/hw/xsc/ib_umem_ex.h b/drivers/infiniband/hw/xsc/ib_umem_ex.h index cedf13f02108..034d1c55e5aa 100644 --- a/drivers/infiniband/hw/xsc/ib_umem_ex.h +++ b/drivers/infiniband/hw/xsc/ib_umem_ex.h @@ -15,7 +15,7 @@ struct 
invalidation_ctx; // ib umem ex ib_umem add peer memory support struct ib_umem_ex { struct ib_umem umem; -#ifndef MLX_PEER_SUPPORT +#ifndef CONFIG_INFINIBAND_PEER_MEMORY struct ib_peer_memory_client *ib_peer_mem; struct invalidation_ctx *invalidation_ctx; void *peer_mem_client_context; @@ -25,7 +25,7 @@ struct ib_umem_ex { // expand ib_umem to ib_umem_ex by reallocate struct ib_umem_ex *ib_umem_ex(struct ib_umem *umem); -#ifndef MLX_PEER_SUPPORT +#ifndef CONFIG_INFINIBAND_PEER_MEMORY typedef void (*umem_invalidate_func_t)(void *invalidation_cookie, struct ib_umem_ex *umem_ex, unsigned long addr, size_t size); diff --git a/drivers/infiniband/hw/xsc/mr.c b/drivers/infiniband/hw/xsc/mr.c index dac492579e25..a44189791ab5 100644 --- a/drivers/infiniband/hw/xsc/mr.c +++ b/drivers/infiniband/hw/xsc/mr.c @@ -14,7 +14,7 @@ #include "ib_umem_ex.h" #include "xsc_ib.h" -#ifndef MLX_PEER_SUPPORT +#ifndef CONFIG_INFINIBAND_PEER_MEMORY static void xsc_invalidate_umem(void *invalidation_cookie, struct ib_umem_ex *umem, unsigned long addr, size_t size); @@ -166,10 +166,17 @@ struct ib_mr *xsc_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, xsc_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx\n", start, virt_addr, length); +#ifdef CONFIG_INFINIBAND_PEER_MEMORY + umem = ib_umem_get_peer(&dev->ib_dev, start, length, + access_flags, IB_PEER_MEM_INVAL_SUPP); +#else umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); +#endif if (IS_ERR(umem)) { // check client peer memory -#ifndef MLX_PEER_SUPPORT +#ifdef CONFIG_INFINIBAND_PEER_MEMORY + return (void *)umem; +#else u8 peer_exists = 0; umem_ex = ib_client_umem_get(pd->uobject->context, @@ -191,17 +198,17 @@ struct ib_mr *xsc_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (err) goto error; using_peer_mem = 1; -#else - xsc_ib_warn(dev, "umem get failed\n"); - return (void *)umem; #endif - } else { umem_ex = ib_umem_ex(umem); if (IS_ERR(umem_ex)) { err = -ENOMEM; goto error; } +#ifdef 
CONFIG_INFINIBAND_PEER_MEMORY + if (umem->is_peer) + using_peer_mem = 1; +#endif } umem = &umem_ex->umem; @@ -303,7 +310,7 @@ xsc_ib_dereg_mr_def() return 0; } -#ifndef MLX_PEER_SUPPORT +#ifndef CONFIG_INFINIBAND_PEER_MEMORY static void xsc_invalidate_umem(void *invalidation_cookie, struct ib_umem_ex *umem, unsigned long addr, diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 565a85044541..324cc50dbc15 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -25,6 +25,10 @@ struct ib_umem { u32 writable : 1; u32 is_odp : 1; u32 is_dmabuf : 1; +#ifdef CONFIG_INFINIBAND_PEER_MEMORY + /* Placing at the end of the bitfield list is ABI preserving on LE */ + u32 is_peer : 1; +#endif struct sg_append_table sgt_append; }; @@ -45,6 +49,14 @@ static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem) return container_of(umem, struct ib_umem_dmabuf, umem); } +#ifdef CONFIG_INFINIBAND_PEER_MEMORY +typedef void (*umem_invalidate_func_t)(struct ib_umem *umem, void *priv); +enum ib_peer_mem_flags { + IB_PEER_MEM_ALLOW = 1 << 0, + IB_PEER_MEM_INVAL_SUPP = 1 << 1, +}; +#endif + /* Returns the offset of the umem start relative to the first page. 
*/ static inline int ib_umem_offset(struct ib_umem *umem) { @@ -153,6 +165,15 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); +#ifdef CONFIG_INFINIBAND_PEER_MEMORY +struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr, + size_t size, int access, + unsigned long peer_mem_flags); +void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, + umem_invalidate_func_t func, + void *cookie); +void ib_umem_stop_invalidation_notifier(struct ib_umem *umem); +#endif #else /* CONFIG_INFINIBAND_USER_MEM */ @@ -202,6 +223,24 @@ static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) } static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { } static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { } +#ifdef CONFIG_INFINIBAND_PEER_MEMORY +static inline struct ib_umem *ib_umem_get_peer(struct ib_device *device, + unsigned long addr, size_t size, + int access, + unsigned long peer_mem_flags) +{ + return ERR_PTR(-EINVAL); +} +static inline void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, + umem_invalidate_func_t func, + void *cookie) +{ +} + +static inline void ib_umem_stop_invalidation_notifier(struct ib_umem *umem) +{ +} +#endif #endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ diff --git a/include/rdma/peer_mem.h b/include/rdma/peer_mem.h new file mode 100644 index 000000000000..aa29b3ffb1c4 --- /dev/null +++ b/include/rdma/peer_mem.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved. + * Copyright (C) 2020-2021 NVIDIA CORPORATION & AFFILIATES. All Rights Reserved. 
+ */ +#ifndef RDMA_PEER_MEM_H +#define RDMA_PEER_MEM_H + +#include + +#define IB_PEER_MEMORY_NAME_MAX 64 +#define IB_PEER_MEMORY_VER_MAX 16 + +/* + * Prior versions used a void * for core_context, at some point this was + * switched to use u64. Be careful if compiling this as 32 bit. To help the + * value of core_context is limited to u32 so it should work OK despite the + * type change. + */ +#define PEER_MEM_U64_CORE_CONTEXT + +struct device; + +/** + * struct peer_memory_client - registration information for user virtual + * memory handlers + * + * The peer_memory_client scheme allows a driver to register with the ib_umem + * system that it has the ability to understand user virtual address ranges + * that are not compatible with get_user_pages(). For instance VMAs created + * with io_remap_pfn_range(), or other driver special VMA. + * + * For ranges the interface understands it can provide a DMA mapped sg_table + * for use by the ib_umem, allowing user virtual ranges that cannot be + * supported by get_user_pages() to be used as umems. + */ +struct peer_memory_client { + char name[IB_PEER_MEMORY_NAME_MAX]; + char version[IB_PEER_MEMORY_VER_MAX]; + + /** + * acquire - Begin working with a user space virtual address range + * + * @addr - Virtual address to be checked whether belongs to peer. + * @size - Length of the virtual memory area starting at addr. + * @peer_mem_private_data - Obsolete, always NULL + * @peer_mem_name - Obsolete, always NULL + * @client_context - Returns an opaque value for this acquire use in + * other APIs + * + * Returns 1 if the peer_memory_client supports the entire virtual + * address range, 0 or -ERRNO otherwise. If 1 is returned then + * release() will be called to release the acquire(). 
+ */ + int (*acquire)(unsigned long addr, size_t size, + void *peer_mem_private_data, char *peer_mem_name, + void **client_context); + /** + * get_pages - Fill in the first part of a sg_table for a virtual + * address range + * + * @addr - Virtual address to be checked whether belongs to peer. + * @size - Length of the virtual memory area starting at addr. + * @write - Always 1 + * @force - 1 if write is required + * @sg_head - Obsolete, always NULL + * @client_context - Value returned by acquire() + * @core_context - Value to be passed to invalidate_peer_memory for + * this get + * + * addr/size are passed as the raw virtual address range requested by + * the user, it is not aligned to any page size. get_pages() is always + * followed by dma_map(). + * + * Upon return the caller can call the invalidate_callback(). + * + * Returns 0 on success, -ERRNO on failure. After success put_pages() + * will be called to return the pages. + */ + int (*get_pages)(unsigned long addr, size_t size, int write, int force, + struct sg_table *sg_head, void *client_context, + u64 core_context); + /** + * dma_map - Create a DMA mapped sg_table + * + * @sg_head - The sg_table to allocate + * @client_context - Value returned by acquire() + * @dma_device - The device that will be doing DMA from these addresses + * @dmasync - Obsolete, always 0 + * @nmap - Returns the number of dma mapped entries in the sg_head + * + * Must be called after get_pages(). This must fill in the sg_head with + * DMA mapped SGLs for dma_device. Each SGL start and end must meet a + * minimum alignment of at least PAGE_SIZE, though individual sgls can + * be multiples of PAGE_SIZE, in any mixture. Since the user virtual + * address/size are not page aligned, the implementation must increase + * it to the logical alignment when building the SGLs. + * + * Returns 0 on success, -ERRNO on failure. After success dma_unmap() + * will be called to unmap the pages. 
On failure sg_head must be left + * untouched or point to a valid sg_table. + */ + int (*dma_map)(struct sg_table *sg_head, void *client_context, + struct device *dma_device, int dmasync, int *nmap); + /** + * dma_unmap - Unmap a DMA mapped sg_table + * + * @sg_head - The sg_table to unmap + * @client_context - Value returned by acquire() + * @dma_device - The device that will be doing DMA from these addresses + * + * sg_head will not be touched after this function returns. + * + * Must return 0. + */ + int (*dma_unmap)(struct sg_table *sg_head, void *client_context, + struct device *dma_device); + /** + * put_pages - Unpin a SGL + * + * @sg_head - The sg_table to unpin + * @client_context - Value returned by acquire() + * + * sg_head must be freed on return. + */ + void (*put_pages)(struct sg_table *sg_head, void *client_context); + /* Client should always return PAGE_SIZE */ + unsigned long (*get_page_size)(void *client_context); + /** + * release - Undo acquire + * + * @client_context - Value returned by acquire() + * + * If acquire() returns 1 then release() must be called. All + * get_pages() and dma_map()'s must be undone before calling this + * function. + */ + void (*release)(void *client_context); +}; + +enum { + PEER_MEM_INVALIDATE_UNMAPS = 1 << 0, +}; + +struct peer_memory_client_ex { + struct peer_memory_client client; + size_t ex_size; + u32 flags; +}; + +/* + * If invalidate_callback() is non-NULL then the client will only support + * umems which can be invalidated. The caller may call the + * invalidate_callback() after acquire(); on return the range will no longer + * have DMA active, and release() will have been called. + * + * Note: The implementation locking must ensure that get_pages(), and + * dma_map() do not have locking dependencies with invalidate_callback(). The + * ib_core will wait until any concurrent get_pages() or dma_map() completes + * before returning.
+ * + * Similarly, this can call dma_unmap(), put_pages() and release() from within + * the callback, or will wait for another thread doing those operations to + * complete. + * + * For these reasons the user of invalidate_callback() must be careful with + * locking. + */ +typedef int (*invalidate_peer_memory)(void *reg_handle, u64 core_context); + +void * +ib_register_peer_memory_client(const struct peer_memory_client *peer_client, + invalidate_peer_memory *invalidate_callback); +void ib_unregister_peer_memory_client(void *reg_handle); + +#endif -- Gitee From ec7901a322de3512567c1962bff2c3bd4a0a935d Mon Sep 17 00:00:00 2001 From: tianx Date: Wed, 21 Aug 2024 18:46:26 +0800 Subject: [PATCH 2/2] drivers: Fix kabi check failure yunsilicon inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IALL3Y CVE: NA ------------------------------------------ Fix kabi check failure Fixes: a2fe6fdfc37c ("drivers: Add GDR(GPU Direct RDMA) support") Reviewed-by: Wei Honggang Reviewed-by: Wang Saochuang Signed-off-by: Tian Xin --- include/rdma/ib_umem.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 324cc50dbc15..bb133a973d2a 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -26,8 +26,10 @@ struct ib_umem { u32 is_odp : 1; u32 is_dmabuf : 1; #ifdef CONFIG_INFINIBAND_PEER_MEMORY +#ifndef __GENKSYMS__ /* Placing at the end of the bitfield list is ABI preserving on LE */ u32 is_peer : 1; +#endif #endif struct sg_append_table sgt_append; }; -- Gitee