From e890e0f6ded0f9d8701ae97646be61e4c9d57ccc Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker
Date: Sat, 29 May 2021 07:03:30 +0000
Subject: [PATCH 01/11] mm: export symbol find_get_task_by_vpid

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX
Reference: https://jpbrucker.net/git/linux/commit/?h=sva/2021-03-01&id=176b3454e53ffdd4a4f6dbc51f128f1f35a1b357

---------------------------------------------

Userspace drivers implemented with VFIO might want to bind sub-processes
to their devices. In a VFIO ioctl, they provide a pid that is used to
find a task and its mm. Since VFIO can be built as a module, export the
find_get_task_by_vpid symbol.

Signed-off-by: Jean-Philippe Brucker
Signed-off-by: Lijun Fang
Reviewed-by: Weilong Chen
Signed-off-by: Zheng Zengkai
[ Ma Wupeng: cherry pick from OLK-5.10 ]
Signed-off-by: Ma Wupeng
---
 kernel/pid.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/pid.c b/kernel/pid.c
index f93954a0384d..69089222d8d8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -434,6 +434,7 @@ struct task_struct *find_get_task_by_vpid(pid_t nr)
 
 	return task;
 }
+EXPORT_SYMBOL_GPL(find_get_task_by_vpid);
 
 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
--
Gitee

From 033ce181d678782a8be371057aea7dfa8a129705 Mon Sep 17 00:00:00 2001
From: Weixi Zhu
Date: Sat, 26 Aug 2023 09:04:58 +0800
Subject: [PATCH 02/11] mm: gmem: Introduce CONFIG_GMEM

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX

---------------------------------------------

Introduce config GMEM in preparation for the isolation code for gmem.

Signed-off-by: Weixi Zhu
---
 mm/Kconfig | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index 7672a22647b4..7bf98487c1cc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1206,6 +1206,14 @@ config PER_VMA_LOCK
 	  This feature allows locking each virtual memory area separately when
 	  handling page faults instead of taking mmap_lock.
 
+config GMEM
+	bool "gmem subsystem for multi-MMU cooperative management"
+	depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE
+	select ARCH_USES_HIGH_VMA_FLAGS
+	default y
+	help
+	  Say Y here to enable the gmem subsystem.
+
 source "mm/damon/Kconfig"
 
 endmenu
--
Gitee

From afdebb5a6a366d3efad5f4a85f75bdc7279350cc Mon Sep 17 00:00:00 2001
From: Jiangtian Feng
Date: Sat, 26 Aug 2023 09:27:19 +0800
Subject: [PATCH 03/11] mm: gmem: Introduce new node state N_HETEROGENEOUS

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX

---------------------------------------------

Introduce a new node state, N_HETEROGENEOUS, to indicate heterogeneous
memory devices.
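
As a rough illustration (not part of this patch), the new state is meant
to be consumed through the for_each_hnode()/num_hnodes() helpers added
below in nodemask.h; dump_hnodes() is only a hypothetical caller:

/* Illustrative only: walk every node currently flagged N_HETEROGENEOUS. */
static void dump_hnodes(void)
{
	int nid;

	pr_info("gmem: %d node(s) in hnode_map\n", num_hnodes());
	for_each_hnode(nid)
		pr_info("gmem: node %d provides heterogeneous memory\n", nid);
}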
Co-developed-by: liuzixian
Signed-off-by: Jiangtian Feng
---
 drivers/base/node.c      |  6 ++++++
 include/linux/nodemask.h | 10 ++++++++++
 mm/page_alloc.c          |  3 +++
 3 files changed, 19 insertions(+)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index b46db17124f3..4943a25cc272 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -922,6 +922,9 @@ static struct node_attr node_state_attr[] = {
 	[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 	[N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
 					   N_GENERIC_INITIATOR),
+#ifdef CONFIG_GMEM
+	[N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS),
+#endif
 };
 
 static struct attribute *node_state_attrs[] = {
@@ -934,6 +937,9 @@ static struct attribute *node_state_attrs[] = {
 	&node_state_attr[N_MEMORY].attr.attr,
 	&node_state_attr[N_CPU].attr.attr,
 	&node_state_attr[N_GENERIC_INITIATOR].attr.attr,
+#ifdef CONFIG_GMEM
+	&node_state_attr[N_HETEROGENEOUS].attr.attr,
+#endif
 	NULL
 };
 
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index bb0ee80526b2..39820123d793 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -407,6 +407,9 @@ enum node_states {
 	N_MEMORY,		/* The node has memory(regular, high, movable) */
 	N_CPU,			/* The node has one or more cpus */
 	N_GENERIC_INITIATOR,	/* The node has one or more Generic Initiators */
+#ifdef CONFIG_GMEM
+	N_HETEROGENEOUS,	/* The node has heterogeneous memory */
+#endif
 	NR_NODE_STATES
 };
 
@@ -536,6 +539,13 @@ static inline int node_random(const nodemask_t *maskp)
 #define for_each_node(node)	   for_each_node_state(node, N_POSSIBLE)
 #define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
 
+#ifdef CONFIG_GMEM
+/* For h-NUMA topology */
+#define hnode_map	node_states[N_HETEROGENEOUS]
+#define num_hnodes()	num_node_state(N_HETEROGENEOUS)
+#define for_each_hnode(node)	for_each_node_state(node, N_HETEROGENEOUS)
+#endif
+
 /*
  * For nodemask scratch area.
  * NODEMASK_ALLOC(type, name) allocates an object with a specified type and
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47421bedc12b..90762bee9730 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -216,6 +216,9 @@ EXPORT_SYMBOL(latent_entropy);
 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 	[N_POSSIBLE] = NODE_MASK_ALL,
 	[N_ONLINE] = { { [0] = 1UL } },
+#ifdef CONFIG_GMEM
+	[N_HETEROGENEOUS] = NODE_MASK_NONE,
+#endif
 #ifndef CONFIG_NUMA
 	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
 #ifdef CONFIG_HIGHMEM
--
Gitee

From 2f35a92a2074ede487cad370bd0f3f888678aa9a Mon Sep 17 00:00:00 2001
From: Yang Yanchao
Date: Sat, 26 Aug 2023 09:47:00 +0800
Subject: [PATCH 04/11] mm: gmem: Introduce gmem related madvise opcode

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX

---------------------------------------------

Introduce new madvise opcodes in preparation for hmadvise:

  MADV_PREFETCH: prefetch pages for an hNUMA node
  MADV_PINNED:   pin pages

To avoid conflicts with existing or future madvise opcodes, make the
new ones begin at 0x1000.
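
That non-overlap assumption can also be checked at compile time; the
guard below is only an illustration of the assumption, not part of the
patch:

#include <linux/build_bug.h>
#include <linux/mman.h>

/* MADV_COLLAPSE (25) is the highest standard opcode; the gmem opcodes
 * start at 0x1000, so the two ranges cannot collide.
 */
static inline void madv_gmem_range_check(void)
{
	BUILD_BUG_ON(MADV_GMEM_BASE <= MADV_COLLAPSE);
	BUILD_BUG_ON(MADV_PINNED != MADV_GMEM_BASE + 1);
}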
Signed-off-by: Yang Yanchao --- include/uapi/asm-generic/mman-common.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432..029c717e6755 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,11 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* for hmadvise */ +#define MADV_GMEM_BASE 0x1000 +#define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ +#define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ + /* compatibility flags */ #define MAP_FILE 0 -- Gitee From 232752caeae541bad5a96d7a26aa50b4732eaf7b Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Sat, 26 Aug 2023 15:18:16 +0800 Subject: [PATCH 05/11] mm: gmem: Introduce vm_object in preparation for gmem euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX --------------------------------------------- Defines a centrailized logical mapping table that reflects the mapping information regardless of the underlying arch-specific MMUs. Co-developed-by: Yang Kunlin Signed-off-by: Liu Chao --- include/linux/mm_types.h | 41 +++++++++++++++++++++++++++++++++++++++ include/linux/vm_object.h | 12 ++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 include/linux/vm_object.h diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 306a3d1a0fa6..bd46194056ab 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -465,6 +465,44 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +#ifdef CONFIG_GMEM +/* + * Defines a centralized logical mapping table that reflects the mapping information + * regardless of the underlying arch-specific MMUs. + * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well + * as the filemap address_space struct from Linux page cache. + * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that + * the coordiantion between page tables must happen with CPU page table involved. That + * is to say, a generalized process unit must involve in a UVA-programming model, otherwise + * there is no point to support UVA programming. + * However, a VMA only needs to maintain logical mappings if the process has been + * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless + * we later build a reservation system on top of the logical mapping tables to support + * reservation-based superpages and rangeTLBs). + * A GM_REGION does not need to maintain logical mappings. In the case that a device wants + * to support its private address space with local physical memory, GMEM should forward address + * space management to the core VM, using VMAs, instead of using GM_REGIONs. + */ +struct vm_object { + spinlock_t lock; + struct vm_area_struct *vma; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; + + /* + * a vm object might be referred by multiple VMAs to share + * memory. + */ + atomic_t ref_count; +}; +typedef struct vm_object vm_object_t; +#endif + struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. 
*/ @@ -571,6 +609,9 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_GMEM + struct vm_object *vm_obj; +#endif } __randomize_layout; #ifdef CONFIG_SCHED_MM_CID diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h new file mode 100644 index 000000000000..e3a81b2a0d5c --- /dev/null +++ b/include/linux/vm_object.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VM_OBJECT_H +#define _VM_OBJECT_H + +#ifdef CONFIG_GMEM + +static inline struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) { return NULL; } +static inline int vm_object_mapping_create(vm_object_t *obj, gm_va_t start) { return 0; } + +#endif + +#endif /* _VM_OBJECT_H */ -- Gitee From 46a7894b5e4c6b5a39d6e1c0197989557e967d76 Mon Sep 17 00:00:00 2001 From: Bin Wang Date: Sat, 26 Aug 2023 16:22:05 +0800 Subject: [PATCH 06/11] mm: gmem: Introduce GMEM euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX --------------------------------------------- The functions of the GMEM can be summarized as follows: First, the accelerator driver can directly reuse the core VM code of Linux by providing the MMU operation function required by the GMEM, and no independent memory management mechanism is required. Second, the GMEM can coordinate a page table between multiple heterogeneous MMUs, so as to implement memory coherence (memory coherence) between the CPU and the accelerator in a same address space. From a kernel point of view, the driver code for memory management with repetitive functions is greatly reduced. From the perspective of driver programming, the development and maintenance workload of driver code is greatly reduced. From the perspective of application development, the same address space greatly reduces programming complexity, while GMEM provides heterogeneous memory semantics to enhance flexibility and ease of use in performance tuning. To enable gmem, add "gmem=on" in kernel commandline. Co-developed-by: Yang Yanchao Co-developed-by: luochunsheng Co-developed-by: Weixi Zhu Signed-off-by: Bin Wang --- include/linux/gmem.h | 387 +++++++++++++++++++++++++ include/linux/gmem_as.h | 40 +++ include/linux/mm.h | 11 + include/linux/mm_types.h | 4 + include/linux/vm_object.h | 1 + mm/Makefile | 1 + mm/gmem.c | 574 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 1018 insertions(+) create mode 100644 include/linux/gmem.h create mode 100644 include/linux/gmem_as.h create mode 100644 mm/gmem.c diff --git a/include/linux/gmem.h b/include/linux/gmem.h new file mode 100644 index 000000000000..fb1219394f79 --- /dev/null +++ b/include/linux/gmem.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Weixi Zhu + * + */ +#ifndef _GMEM_H +#define _GMEM_H + +#include + +typedef unsigned long gm_region_placement_t; +typedef unsigned long gm_prot_t; +typedef enum gm_ret gm_ret_t; +typedef struct gm_region gm_region_t; +typedef struct gm_mapping_set gm_mapping_set_t; +typedef enum gm_mmu_mode gm_mmu_mode_t; +typedef struct gm_mmu gm_mmu_t; +typedef unsigned long gm_dev_cap_t; +typedef struct gm_context gm_context_t; +typedef struct gm_dev gm_dev_t; +typedef struct gm_mapping gm_mapping_t; + +struct hnode; + +/* + * enum gm_ret - The return value of GMEM KPI that can be used to tell + * the core VM or peripheral driver whether the GMEM KPI was + * executed successfully. + * + * @GM_RET_SUCCESS: The invoked GMEM KPI behaved as expected. + * @GM_RET_FAILURE_UNKNOWN: The GMEM KPI failed with unknown reason. + * Any external status related to this KPI invocation changes must be rolled back. + */ +enum gm_ret { + GM_RET_SUCCESS = 0, + GM_RET_NOMEM, + GM_RET_PAGE_EXIST, + GM_RET_DMA_ERROR, + GM_RET_MIGRATING, + GM_RET_FAILURE_UNKNOWN, + GM_RET_UNIMPLEMENTED, +}; + +/* + * Defines a contiguous range of virtual addresses inside a gm_as_t + * As an analogy, this is conceptually similar as virtual_address_struct + */ +struct gm_region { + gm_va_t start_va; + gm_va_t end_va; + struct rb_node node; + gm_as_t *as; /* The address space that it belongs to */ + + /* Do we need another list_node to maintain a tailQ of allocated VMAs inside a gm_as? */ + struct list_head mapping_set_link; + + void (*callback_op)(void *args); + void *cb_args; +}; + +/* This holds a list of regions that must not be concurrently manipulated. */ +struct gm_mapping_set { + unsigned int region_cnt; + struct list_head gm_region_list; +}; + +/** + * enum gm_mmu_mode - defines the method to share a physical page table. + * + * @GM_MMU_MODE_SHARE: Literally share a physical page table with another + * attached device's MMU. Nothing is guaranteed about the allocated address. + * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds + * exclusive mapping entries, so that device memory accesses can trigger fault-driven + * migration for automatic data locality optimizations. + * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates physical + * mapping entries whenever a physical mapping is installed inside the address space, so + * that it may minimize the page faults to be triggered by this device. + */ +enum gm_mmu_mode { + GM_MMU_MODE_SHARE, + GM_MMU_MODE_COHERENT_EXCLUSIVE, + GM_MMU_MODE_REPLICATE, +}; + +/* + * This is the parameter list of peer_map/unmap mmu operations. + * if device should copy data to/from host, set copy and dma_addr + */ +struct gm_fault_t { + struct mm_struct *mm; + gm_dev_t *dev; + gm_va_t va; + gm_pa_t size; + gm_prot_t prot; + bool copy; + dma_addr_t dma_addr; + int behavior; +}; + +struct gm_memcpy_t { + struct mm_struct *mm; + gm_dev_t *dev; + gm_va_t src; + gm_va_t dest; + dma_addr_t dma_addr; + size_t size; +}; + +/** + * + * This struct defines a series of MMU functions registered by a peripheral + * device that is to be invoked by GMEM. + * + * pmap is an opaque pointer that identifies a physical page table of a device. + * A physical page table holds the physical mappings that can be interpreted by + * the hardware MMU. + */ +struct gm_mmu { + /* + * Each bit indicates a supported page size for page-based TLB. + * Currently we do not consider range TLBs. + */ + unsigned long pgsize_bitmap; + + /* + * cookie identifies the type of the MMU. 
If two gm_mmu shares the same cookie, + * then it means their page table formats are compatible. + * In that case, they can share the same void *pmap as the input arg. + */ + unsigned long cookie; + + /* Synchronize VMA in a peer OS to interact with the host OS */ + gm_ret_t (*peer_va_alloc_fixed)(struct gm_fault_t *gmf); + gm_ret_t (*peer_va_free)(struct gm_fault_t *gmf); + + /* Create physical mappings on peer host. + * If copy is set, copy data [dma_addr, dma_addr + size] to peer host + */ + gm_ret_t (*peer_map)(struct gm_fault_t *gmf); + /* + * Destroy physical mappings on peer host. + * If copy is set, copy data back to [dma_addr, dma_addr + size] + */ + gm_ret_t (*peer_unmap)(struct gm_fault_t *gmf); + + /* Create or destroy a device's physical page table. */ + gm_ret_t (*pmap_create)(gm_dev_t *dev, void **pmap); + gm_ret_t (*pmap_destroy)(void *pmap); + + /* Create or destroy a physical mapping of a created physical page table */ + gm_ret_t (*pmap_enter)(void *pmap, gm_va_t va, gm_va_t size, + gm_pa_t pa, gm_prot_t prot); + gm_ret_t (*pmap_release)(void *pmap, gm_va_t va, gm_va_t size); + + /* Change the protection of a virtual page */ + gm_ret_t (*pmap_protect)(void *pmap, gm_va_t va, gm_va_t size, gm_prot_t new_prot); + + /* Invalidation functions of the MMU TLB */ + gm_ret_t (*tlb_invl)(void *pmap, gm_va_t va, gm_va_t size); + gm_ret_t (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); +}; + +/** + * gm_dev_cap_t defines a composable flag to describe the capabilities of a device. + * + * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. + * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS + */ +#define GM_DEV_CAP_REPLAYABLE 0x00000001 +#define GM_DEV_CAP_PEER 0x00000010 + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +struct gm_context { + gm_as_t *as; + gm_dev_t *dev; + void *pmap; + /* + * consider a better container to maintain multiple ctx inside a device or multiple ctx + * inside a va space. + * A device may simultaneously have multiple contexts for time-sliced ctx switching + */ + struct list_head gm_dev_link; + + /* A va space may have multiple gm_context */ + struct list_head gm_as_link; +}; +#define get_gm_context(head) (list_entry((head)->prev, gm_context_t, ctx_link)) + +struct gm_dev { + int id; + + /* identifies the device capability + * For example, whether the device supports page faults or whether it has its + * own OS that manages the VA and PA resources. + */ + gm_dev_cap_t capability; + gm_mmu_t *mmu; + void *dev_data; + /* + * TODO: Use a better container of gm_context_t to support time-sliced context switch. + * A collection of device contexts. If the device does not support time-sliced context + * switch, then the size of the collection should never be greater than one. + * We need to think about what operators should the container be optimized for. + * A list, a radix-tree or what? What would gm_dev_activate require? + * Are there any accelerators that are really going to support time-sliced context switch? + */ + gm_context_t *current_ctx; + + struct list_head gm_ctx_list; + + /* Add tracking of registered device local physical memory. */ + nodemask_t registered_hnodes; + struct device *dma_dev; + + gm_mapping_t *gm_mapping; +}; + +#define HOST_NODE_ID (-1) + +#define GM_PAGE_DIRTY 0x8 /* Whether the page is dirty */ +#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. 
*/ +#define GM_PAGE_DEVICE 0x20 +#define GM_PAGE_NOMAP 0x40 +#define GM_PAGE_PINNED 0x80 +#define GM_PAGE_WILLNEED 0x100 + +#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) + +/* Records the status of a page-size physical page */ +struct gm_mapping { + /* + * The node index may have three definitions: + * 1. a common CPU node + * 2. a hetero-node, e.g. GPU (that not necessarily supports CC ld/st) + * 3. a network ip (another OS that may have multiple hNUMA nodes), dynamically attached by dsm_attach + * Among these definitions, #1 and #2 in combination defines an h-NUMA topology + */ + unsigned int node_id; + + unsigned int flag; + + union { + struct page *page; /* CPU node */ + gm_dev_t *dev; /* hetero-node */ + gm_pa_t pfn; + }; + + struct mutex lock; +}; + +static inline bool gm_mapping_cpu(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_CPU); +} + +static inline void set_gm_mapping_host(gm_mapping_t *gm_mapping, struct page *page) +{ + gm_mapping->node_id = HOST_NODE_ID; + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_CPU; + gm_mapping->page = page; +} + +static inline bool gm_mapping_device(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_DEVICE); +} + +static inline void set_gm_mapping_device(gm_mapping_t *gm_mapping, gm_dev_t *dev) +{ + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_DEVICE; + gm_mapping->dev = dev; +} + +static inline bool gm_mapping_nomap(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_NOMAP); +} + +static inline void set_gm_mapping_nomap(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_NOMAP; + gm_mapping->page = NULL; +} + +static inline void set_gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag |= GM_PAGE_WILLNEED; +} + +static inline void clear_gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_WILLNEED; +} + +static inline bool gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_WILLNEED); +} + +static inline void set_gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag |= GM_PAGE_PINNED; +} + +static inline void clear_gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_PINNED; +} + +static inline bool gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_PINNED); +} + +#define test_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define set_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } + +/* GMEM Device KPI */ +extern gm_ret_t gm_dev_create(gm_mmu_t *mmu, void *dev_data, gm_dev_cap_t cap, gm_dev_t **new_dev); +extern gm_ret_t gm_dev_destroy(gm_dev_t *dev); +extern gm_ret_t gm_dev_switch(gm_dev_t *dev, gm_as_t *as); +extern gm_ret_t gm_dev_detach(gm_dev_t *dev, gm_as_t *as); +extern gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end); +gm_ret_t gm_dev_fault(struct mm_struct *mm, gm_va_t addr, gm_dev_t *dev, int behavior); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size); + +/* GMEM address space KPI */ +extern gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end); +extern void gm_dev_unregister_physmem(gm_dev_t *dev, unsigned int nid); +extern gm_mapping_t *gm_mappings_alloc(unsigned int nid, unsigned int order); +extern void gm_mappings_free(gm_mapping_t *mapping, 
unsigned int order); +extern gm_ret_t gm_as_create(gm_va_t begin, gm_va_t end, gm_as_alloc_t policy, gm_va_t cache_quantum, gm_as_t **new_as); +extern gm_ret_t gm_as_destroy(gm_as_t *as); +extern gm_ret_t gm_as_attach(gm_as_t *as, gm_dev_t *dev, gm_mmu_mode_t mode, bool activate, gm_context_t **out_ctx); +extern gm_va_t gm_as_alloc(gm_as_t *as, gm_va_t hint, gm_va_t size, gm_va_t align, gm_va_t no_cross, + gm_va_t max_va, gm_region_t **new_region); + +enum gmem_stat_item { + NR_PAGE_MIGRATING, + NR_GMEM_STAT_ITEMS +}; + +extern void gmem_state_counter(enum gmem_stat_item item, int val); +extern void gmem_state_counter_show(void); + +/* h-NUMA topology */ +struct hnode { + unsigned int id; + + gm_dev_t *dev; + + struct xarray pages; +}; + +extern struct hnode *hnodes[]; + +static inline bool is_hnode(int node) +{ + return !node_isset(node, node_possible_map) + && node_isset(node, hnode_map); +} + +static inline bool is_hnode_allowed(int node) +{ + return is_hnode(node) && node_isset(node, current->mems_allowed); +} + +static inline struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +void __init hnuma_init(void); +unsigned int alloc_hnode_id(void); +void free_hnode_id(unsigned int nid); +void hnode_init(struct hnode *hnode, unsigned int hnid, gm_dev_t *dev); +void hnode_deinit(unsigned int hnid, gm_dev_t *dev); + +#endif /* _GMEM_H */ diff --git a/include/linux/gmem_as.h b/include/linux/gmem_as.h new file mode 100644 index 000000000000..f62dac65e2ed --- /dev/null +++ b/include/linux/gmem_as.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _GMEM_AS_H +#define _GMEM_AS_H + +typedef struct gm_as gm_as_t; +typedef unsigned long gm_va_t; +typedef unsigned long gm_pa_t; +typedef enum gm_as_alloc gm_as_alloc_t; + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. 
*/ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of gm_as_t */ + struct rb_root rbroot; /*root of gm_region_t */ + gm_as_alloc_t policy; + gm_va_t start_va; + gm_va_t end_va; + gm_va_t cache_quantum; /* defines the VA unit size if an object cache is applied */ + + struct list_head gm_ctx_list; /* tracks device contexts attached to this va space, using gm_as_link */ +}; + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 27ce77080c79..f2b83099948c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3816,4 +3816,15 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif +#ifdef CONFIG_GMEM +DECLARE_STATIC_KEY_FALSE(gmem_status); + +static inline bool gmem_is_enabled(void) +{ + return static_branch_likely(&gmem_status); +} +#else +static inline bool gmem_is_enabled(void) { return false; } +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bd46194056ab..3a1a0036932c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #ifndef AT_VECTOR_SIZE_ARCH diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index e3a81b2a0d5c..0f691ce80028 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -4,6 +4,7 @@ #ifdef CONFIG_GMEM +static inline int __init vm_object_init(void) { return 0; } static inline struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) { return NULL; } static inline int vm_object_mapping_create(vm_object_t *obj, gm_va_t start) { return 0; } diff --git a/mm/Makefile b/mm/Makefile index e29afc890cde..c1c7b88f3184 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -40,6 +40,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o +mmu-$(CONFIG_GMEM) += gmem.o ifdef CONFIG_CROSS_MEMORY_ATTACH diff --git a/mm/gmem.c b/mm/gmem.c new file mode 100644 index 000000000000..8e8b1561f59f --- /dev/null +++ b/mm/gmem.c @@ -0,0 +1,574 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Weixi Zhu + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(gmem_status); +EXPORT_SYMBOL_GPL(gmem_status); + +static struct kmem_cache *gm_as_cache; +static struct kmem_cache *gm_dev_cache; +static struct kmem_cache *gm_ctx_cache; +static struct kmem_cache *gm_region_cache; +static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); + +static bool enable_gmem; + +static inline unsigned long pe_mask(enum page_entry_size pe_size) +{ + if (pe_size == PE_SIZE_PTE) + return PAGE_MASK; + if (pe_size == PE_SIZE_PMD) + return HPAGE_PMD_MASK; + if (pe_size == PE_SIZE_PUD) + return HPAGE_PUD_MASK; + return ~0; +} + +static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; + +void gmem_state_counter(enum gmem_stat_item item, int val) +{ + if (!gmem_is_enabled()) + return; + + if (WARN_ON_ONCE(unlikely(item >= NR_GMEM_STAT_ITEMS))) + return; + + percpu_counter_add(&g_gmem_stats[item], val); +} + +#ifdef CONFIG_PROC_FS +static int gmemstat_show(struct seq_file *m, void *arg) +{ + if (!gmem_is_enabled()) + return 0; + + seq_printf(m, "migrating : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING])); + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static struct workqueue_struct *prefetch_wq; + +#define GM_WORK_CONCURRENCY 4 + +static int __init gmem_init(void) +{ + int err = -ENOMEM; + + if (!enable_gmem) + return 0; + + gm_as_cache = KMEM_CACHE(gm_as, 0); + if (!gm_as_cache) + goto out; + + gm_dev_cache = KMEM_CACHE(gm_dev, 0); + if (!gm_dev_cache) + goto free_as; + + gm_ctx_cache = KMEM_CACHE(gm_context, 0); + if (!gm_ctx_cache) + goto free_dev; + + gm_region_cache = KMEM_CACHE(gm_region, 0); + if (!gm_region_cache) + goto free_ctx; + + err = vm_object_init(); + if (err) + goto free_ctx; + + prefetch_wq = alloc_workqueue("prefetch", + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); + if (!prefetch_wq) { + pr_info("fail to alloc workqueue prefetch_wq\n"); + err = -EFAULT; + goto free_ctx; + } + +#ifdef CONFIG_PROC_FS + proc_create_single("gmemstat", 0444, NULL, gmemstat_show); +#endif + + static_branch_enable(&gmem_status); + + return 0; + +free_ctx: + kmem_cache_destroy(gm_ctx_cache); +free_dev: + kmem_cache_destroy(gm_dev_cache); +free_as: + kmem_cache_destroy(gm_as_cache); +out: + return -ENOMEM; +} +subsys_initcall(gmem_init); + +static int __init setup_gmem(char *str) +{ + strtobool(str, &enable_gmem); + + return 1; +} +__setup("gmem=", setup_gmem); + +/* + * Create a GMEM device, register its MMU function and the page table. + * The returned device pointer will be passed by new_dev. + * A unique id will be assigned to the GMEM device, using Linux's xarray. 
+ */ +gm_ret_t gm_dev_create(gm_mmu_t *mmu, void *dev_data, gm_dev_cap_t cap, gm_dev_t **new_dev) +{ + gm_dev_t *dev; + + if (!gmem_is_enabled()) + return GM_RET_FAILURE_UNKNOWN; + + dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); + if (!dev) + return GM_RET_NOMEM; + + if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, GFP_KERNEL)) { + kmem_cache_free(gm_dev_cache, dev); + return GM_RET_NOMEM; + } + + dev->capability = cap; + dev->mmu = mmu; + dev->dev_data = dev_data; + dev->current_ctx = NULL; + INIT_LIST_HEAD(&dev->gm_ctx_list); + *new_dev = dev; + nodes_clear(dev->registered_hnodes); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_create); + +/* Handle the page fault triggered by a given device */ +gm_ret_t gm_dev_fault(struct mm_struct *mm, gm_va_t addr, gm_dev_t *dev, int behavior) +{ + gm_ret_t ret = GM_RET_SUCCESS; + gm_mmu_t *mmu = dev->mmu; + struct device *dma_dev = dev->dma_dev; + struct vm_area_struct *vma; + vm_object_t *obj; + gm_mapping_t *gm_mapping; + gm_va_t size = HPAGE_SIZE; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior + }; + struct page *page = NULL; + + mmap_read_lock(mm); + + vma = find_vma(mm, addr); + if (!vma) { + pr_info("gmem: %s no vma\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + obj = vma->vm_obj; + if (!obj) { + pr_info("gmem: %s no vm_obj\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + vm_object_mapping_create(obj, addr); + gm_mapping = vm_object_lookup(obj, addr); + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + goto peer_map; + } else if (gm_mapping_device(gm_mapping)) { + if (behavior == MADV_WILLNEED || behavior == MADV_PINNED) { + goto peer_map; + } else { + ret = 0; + goto unlock; + } + } else if (gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (!page) { + pr_err("gmem: host gm_mapping page is NULL. Set nomap\n"); + set_gm_mapping_nomap(gm_mapping); + goto unlock; + } + get_page(page); + zap_page_range_single(vma, addr, size, NULL); + gmf.dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) + pr_info("gmem: dma map failed\n"); + + gmf.copy = true; + } + +peer_map: + ret = mmu->peer_map(&gmf); + if (ret != GM_RET_SUCCESS) { + if (ret == GM_RET_MIGRATING) { + /* + * gmem page is migrating due to overcommit. 
+ * update page to willneed and this will stop page evicting + */ + set_gm_mapping_willneed(gm_mapping); + gmem_state_counter(NR_PAGE_MIGRATING, 1); + ret = GM_RET_SUCCESS; + } else { + pr_err("gmem: peer map failed\n"); + if (page) { + set_gm_mapping_nomap(gm_mapping); + put_page(page); + } + } + goto unlock; + } + + if (page) { + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + put_page(page); + } + + set_gm_mapping_device(gm_mapping, dev); +unlock: + mutex_unlock(&gm_mapping->lock); +mmap_unlock: + mmap_read_unlock(mm); + return ret; +} +EXPORT_SYMBOL_GPL(gm_dev_fault); + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size) +{ + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & pe_mask(pe_size); + vm_object_t *obj = vma->vm_obj; + gm_mapping_t *gm_mapping; + gm_va_t size = HPAGE_SIZE; + gm_dev_t *dev; + struct device *dma_dev; + struct gm_fault_t gmf = { + .mm = vma->vm_mm, + .va = addr, + .size = size, + .copy = true, + }; + + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + pr_err("gmem: host fault gm_mapping should not be NULL\n"); + return VM_FAULT_SIGBUS; + } + + dev = gm_mapping->dev; + gmf.dev = dev; + dma_dev = dev->dma_dev; + gmf.dma_addr = dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + pr_err("gmem: host fault dma mapping error\n"); + return VM_FAULT_SIGBUS; + } + if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { + pr_err("gmem: peer unmap failed\n"); + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return VM_FAULT_SIGBUS; + } + + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return ret; +} + +/* + * Register the local physical memory of a gmem device. + * This implies dynamically creating + * the struct page data structures. 
+ */ +gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end) +{ + gm_mapping_t *mapping; + gm_pa_t addr = PAGE_ALIGN(begin); + unsigned int nid; + int i, page_num = (end - addr) >> PAGE_SHIFT; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + + if (!hnode) + goto err; + + nid = alloc_hnode_id(); + if (nid == MAX_NUMNODES) + goto free_hnode; + hnode_init(hnode, nid, dev); + + mapping = kvmalloc(sizeof(gm_mapping_t) * page_num, GFP_KERNEL); + if (!mapping) + goto deinit_hnode; + + for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { + mapping[i].node_id = hnode->id; + mapping[i].pfn = addr >> PAGE_SHIFT; + mapping[i].flag = 0; + } + + xa_lock(&hnode->pages); + for (i = 0; i < page_num; i++) { + if (xa_err(__xa_store(&hnode->pages, i, mapping + i, GFP_KERNEL))) { + /* Probably nomem */ + kvfree(mapping); + xa_unlock(&hnode->pages); + goto deinit_hnode; + } + __xa_set_mark(&hnode->pages, i, XA_MARK_0); + } + xa_unlock(&hnode->pages); + + return GM_RET_SUCCESS; + +deinit_hnode: + hnode_deinit(nid, dev); + free_hnode_id(nid); +free_hnode: + kfree(hnode); +err: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gm_dev_register_physmem); + +void gm_dev_unregister_physmem(gm_dev_t *dev, unsigned int nid) +{ + struct hnode *hnode = get_hnode(nid); + gm_mapping_t *mapping = xa_load(&hnode->pages, 0); + + kvfree(mapping); + hnode_deinit(nid, dev); + free_hnode_id(nid); + kfree(hnode); +} +EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); + +gm_mapping_t *gm_mappings_alloc(unsigned int nid, unsigned int order) +{ + gm_mapping_t *mapping; + struct hnode *node = get_hnode(nid); + XA_STATE(xas, &node->pages, 0); + + /* TODO: support order > 0 */ + if (order != 0) + return ERR_PTR(-EINVAL); + + xa_lock(&node->pages); + mapping = xas_find_marked(&xas, ULONG_MAX, XA_MARK_0); + if (!mapping) { + xa_unlock(&node->pages); + return ERR_PTR(-ENOMEM); + } + + xas_clear_mark(&xas, XA_MARK_0); + xa_unlock(&node->pages); + + return mapping; +} +EXPORT_SYMBOL_GPL(gm_mappings_alloc); + +void gm_mappings_free(gm_mapping_t *mapping, unsigned int order) +{ + gm_mapping_t *entry; + struct hnode *node = get_hnode(mapping->node_id); + XA_STATE(xas, &node->pages, 0); + + /* TODO: support order > 0 */ + if (order != 0) + return; + + xas_for_each(&xas, entry, ULONG_MAX) { + if (entry == mapping) { + xas_set_mark(&xas, XA_MARK_0); + break; + } + } +} +EXPORT_SYMBOL_GPL(gm_mappings_free); + +/* GMEM Virtual Address Space API */ +gm_ret_t gm_as_create(gm_va_t begin, gm_va_t end, gm_as_alloc_t policy, + gm_va_t cache_quantum, gm_as_t **new_as) +{ + gm_as_t *as; + + if (!new_as) + return -EINVAL; + + as = kmem_cache_alloc(gm_as_cache, GFP_ATOMIC); + if (!as) + return -ENOMEM; + + spin_lock_init(&as->rbtree_lock); + as->rbroot = RB_ROOT; + as->start_va = begin; + as->end_va = end; + as->policy = policy; + + INIT_LIST_HEAD(&as->gm_ctx_list); + + *new_as = as; + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_create); + +gm_ret_t gm_as_destroy(gm_as_t *as) +{ + gm_context_t *ctx, *tmp_ctx; + + list_for_each_entry_safe(ctx, tmp_ctx, &as->gm_ctx_list, gm_as_link) + kfree(ctx); + + kmem_cache_free(gm_as_cache, as); + + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_destroy); + +gm_ret_t gm_as_attach(gm_as_t *as, gm_dev_t *dev, gm_mmu_mode_t mode, + bool activate, gm_context_t **out_ctx) +{ + gm_context_t *ctx; + int nid; + int ret; + + ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); + if (!ctx) + return GM_RET_NOMEM; + + ctx->as = as; + ctx->dev = dev; + ctx->pmap = NULL; + ret = dev->mmu->pmap_create(dev, 
&ctx->pmap); + if (ret) { + kmem_cache_free(gm_ctx_cache, ctx); + return ret; + } + + INIT_LIST_HEAD(&ctx->gm_dev_link); + INIT_LIST_HEAD(&ctx->gm_as_link); + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); + list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); + + if (activate) { + /* + * Here we should really have a callback function to perform the context switch + * for the hardware. E.g. in x86 this function is effectively flushing the CR3 value. + * Currently we do not care time-sliced context switch, unless someone wants to support it. + */ + dev->current_ctx = ctx; + } + *out_ctx = ctx; + + /* + * gm_as_attach will be used to attach device to process address space. + * Handle this case and add hnodes registered by device to process mems_allowed. + */ + for_each_node_mask(nid, dev->registered_hnodes) + node_set(nid, current->mems_allowed); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_attach); + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +void __init hnuma_init(void) +{ + unsigned int node; + + for_each_node(node) + node_set(node, hnode_map); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + spin_unlock(&hnode_lock); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, gm_dev_t *dev) +{ + hnodes[hnid] = hnode; + hnodes[hnid]->id = hnid; + hnodes[hnid]->dev = dev; + node_set(hnid, dev->registered_hnodes); + xa_init(&hnodes[hnid]->pages); +} + +void hnode_deinit(unsigned int hnid, gm_dev_t *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + xa_destroy(&hnodes[hnid]->pages); + hnodes[hnid] = NULL; +} -- Gitee From 874d0b39f577dd5bcb627938c94ca269b4a8b9cd Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Sat, 26 Aug 2023 09:59:23 +0800 Subject: [PATCH 07/11] mm: gmem: Add gm_dev in struct device euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX --------------------------------------------- Add gm_dev in struct device to keep track on gmem device. Co-developed-by: Jiangtian Feng Co-developed-by: luochunsheng Signed-off-by: Chen Jun --- include/linux/device.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index 472dd24d4823..6a4c901119c8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -32,6 +32,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + struct device; struct device_private; struct device_driver; @@ -655,6 +659,9 @@ struct device { #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif +#ifdef CONFIG_GMEM + gm_dev_t *gm_dev; +#endif }; /** -- Gitee From 9922338f265caf7b56557cbf15ee28be4bb97cc2 Mon Sep 17 00:00:00 2001 From: Weixi Zhu Date: Sat, 26 Aug 2023 10:03:38 +0800 Subject: [PATCH 08/11] mm: gmem: Add gm_as in struct mm_struct euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX --------------------------------------------- Add gm_as in struct mm_struct in preparation for find corresponding gmem device used for hmadvise. 
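
For reference, the lookup this field enables looks roughly like the
sketch below (the gmem character device added next does essentially this
in gmem_get_hnid(); find_gm_dev() is only an illustrative name):

static gm_dev_t *find_gm_dev(struct mm_struct *mm)
{
	gm_context_t *ctx;
	gm_as_t *as = mm->gm_as;

	if (!as)
		return NULL;

	/* Take the first device context attached to this address space. */
	list_for_each_entry(ctx, &as->gm_ctx_list, gm_as_link) {
		if (ctx->dev)
			return ctx->dev;
	}

	return NULL;
}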
Signed-off-by: Weixi Zhu --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a1a0036932c..cac73ccf7367 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -847,6 +847,9 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN */ +#ifdef CONFIG_GMEM + gm_as_t *gm_as; +#endif } __randomize_layout; /* -- Gitee From ba4661abb0275afd74219cf75eb50f4bb95244d1 Mon Sep 17 00:00:00 2001 From: luochunsheng Date: Sat, 26 Aug 2023 10:19:47 +0800 Subject: [PATCH 09/11] mm: gmem: Introduce gmem_dev euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX --------------------------------------------- Introduce gmem_dev to pass ioctl command between userspace & kernel. Co-developed-by: Ni Cunshu Co-developed-by: Feilong Lin Co-developed-by: Weilong Chen Signed-off-by: luochunsheng --- drivers/char/Kconfig | 7 +++ drivers/char/Makefile | 1 + drivers/char/gmem_dev.c | 92 +++++++++++++++++++++++++++++++++++ include/linux/gmem_dev.h | 7 +++ include/uapi/linux/gmem_dev.h | 17 +++++++ mm/Kconfig | 1 + 6 files changed, 125 insertions(+) create mode 100644 drivers/char/gmem_dev.c create mode 100644 include/linux/gmem_dev.h create mode 100644 include/uapi/linux/gmem_dev.h diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 801d6c83f896..f485063c5818 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -421,4 +421,11 @@ config ADI and SSM (Silicon Secured Memory). Intended consumers of this driver include crash and makedumpfile. +config GMEM_DEV + tristate "driver for gmem" + depends on GMEM + default m + help + driver for gmem in order to pass ioctl commands. + endmenu diff --git a/drivers/char/Makefile b/drivers/char/Makefile index c5f532e412f1..4d01ef364c8b 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -44,3 +44,4 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_XILLYBUS_CLASS) += xillybus/ obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o obj-$(CONFIG_ADI) += adi.o +obj-$(CONFIG_GMEM_DEV) += gmem_dev.o diff --git a/drivers/char/gmem_dev.c b/drivers/char/gmem_dev.c new file mode 100644 index 000000000000..702431ce2231 --- /dev/null +++ b/drivers/char/gmem_dev.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static int gmem_get_hnid(unsigned long arg) +{ + void __user *buf = (void __user *)arg; + struct gmem_hnid_arg gmem_hnid; + gm_context_t *ctx, *tmp; + gm_dev_t *gm_dev = NULL; + gm_as_t *as = NULL; + int hnuma_id; + + if (!access_ok(buf, sizeof(struct gmem_hnid_arg))) { + pr_err("access_ok failed\n"); + return -EFAULT; + } + + if (copy_from_user(&gmem_hnid, buf, sizeof(struct gmem_hnid_arg))) { + pr_err("copy_from_user failed.\n"); + return -EFAULT; + } + + if (!current->mm) { + pr_err("current's mm is null.\n"); + return -EFAULT; + } + + as = current->mm->gm_as; + if (!as) { + pr_err("current isn't gmem task failed.\n"); + return -ENODEV; + } + + list_for_each_entry_safe(ctx, tmp, &as->gm_ctx_list, gm_as_link) { + gm_dev = ctx->dev; + if (gm_dev) + break; + } + + if (!gm_dev) { + pr_err("gmem_id_to_device failed.\n"); + return -ENODEV; + } + + hnuma_id = first_node(gm_dev->registered_hnodes); + if (copy_to_user(gmem_hnid.hnuma_id, &hnuma_id, sizeof(int))) { + pr_err("copy_to_user failed.\n"); + return -EFAULT; + } + + return 0; +} + +static long gmem_ioctl(struct file *file, unsigned int cmd, 
unsigned long arg) +{ + long ret = 0; + + if (_IOC_TYPE(cmd) != GMEM_MAGIC) { + pr_err("invalid cmd magic number '%#x', should '%#x'.\n", + _IOC_TYPE(cmd), GMEM_MAGIC); + return -EINVAL; + } + + switch (cmd) { + case GMEM_GET_HNUMA_ID: + ret = gmem_get_hnid(arg); + break; + default: + pr_err("invalid cmd '%#x'.\n", cmd); + return -EINVAL; + } + + return ret; +} + +static const struct file_operations gmem_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = gmem_ioctl, + .compat_ioctl = gmem_ioctl, +}; + +static struct miscdevice gmem_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "gmem", + .fops = &gmem_fops, +}; + +builtin_misc_device(gmem_miscdev); diff --git a/include/linux/gmem_dev.h b/include/linux/gmem_dev.h new file mode 100644 index 000000000000..b1359f19b850 --- /dev/null +++ b/include/linux/gmem_dev.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __GMEM_DEV_H +#define __GMEM_DEV_H + +#include + +#endif diff --git a/include/uapi/linux/gmem_dev.h b/include/uapi/linux/gmem_dev.h new file mode 100644 index 000000000000..f7fc5ca57e8c --- /dev/null +++ b/include/uapi/linux/gmem_dev.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_GMEM_DEV_H +#define _UAPI_LINUX_GMEM_DEV_H + +#include + +#define GMEM_MAGIC 0x55 + +#define _GMEM_GET_HNUMA_ID 1 + +struct gmem_hnid_arg { + int *hnuma_id; +}; + +#define GMEM_GET_HNUMA_ID _IOW(GMEM_MAGIC, _GMEM_GET_HNUMA_ID, struct gmem_hnid_arg) + +#endif diff --git a/mm/Kconfig b/mm/Kconfig index 7bf98487c1cc..b950407dd87f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1210,6 +1210,7 @@ config GMEM bool "gmem subsystem for multi-MMU cooperative management" depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE select ARCH_USES_HIGH_VMA_FLAGS + select GMEM_DEV default y help say Y here to enable gmem subsystem -- Gitee From 3e01aec2b2e83166059d6c3ef9d4c534616688be Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Sat, 26 Aug 2023 11:19:04 +0800 Subject: [PATCH 10/11] mm: gmem: Introduce hmadvise euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX --------------------------------------------- Introduce hmadvise via ioctl in order to specific gmem behavior. 
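
From userspace the interface is expected to look roughly like the sketch
below. It is illustrative only: it assumes the gmem uapi headers are
installed, and MADV_PREFETCH is redefined locally in case the updated
mman headers are not yet available.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/gmem_dev.h>

#ifndef MADV_PREFETCH
#define MADV_PREFETCH 0x1000	/* MADV_GMEM_BASE in mman-common.h */
#endif

/* Prefetch a buffer onto the hNUMA node of the device attached to this task. */
static int prefetch_to_device(void *buf, size_t len)
{
	struct gmem_hnid_arg hnid_arg;
	struct hmadvise_arg adv;
	int hnid, ret;
	int fd = open("/dev/gmem", O_RDWR);

	if (fd < 0)
		return -1;

	hnid_arg.hnuma_id = &hnid;
	ret = ioctl(fd, GMEM_GET_HNUMA_ID, &hnid_arg);
	if (ret)
		goto out;

	adv.hnid = hnid;
	adv.start = (unsigned long)buf;
	adv.len_in = len;
	adv.behavior = MADV_PREFETCH;
	ret = ioctl(fd, GMEM_MADVISE, &adv);
out:
	close(fd);
	return ret;
}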
Co-developed-by: Bin Wang Signed-off-by: Ma Wupeng --- drivers/char/gmem_dev.c | 24 +++++ include/linux/gmem.h | 2 + include/linux/mm.h | 9 ++ include/uapi/linux/gmem_dev.h | 10 ++ init/main.c | 8 ++ mm/gmem.c | 189 ++++++++++++++++++++++++++++++++++ 6 files changed, 242 insertions(+) diff --git a/drivers/char/gmem_dev.c b/drivers/char/gmem_dev.c index 702431ce2231..225ed506a9c0 100644 --- a/drivers/char/gmem_dev.c +++ b/drivers/char/gmem_dev.c @@ -55,6 +55,27 @@ static int gmem_get_hnid(unsigned long arg) return 0; } +static int gmem_hmadvise(unsigned long arg) +{ + struct hmadvise_arg harg; + void __user *buf; + int ret; + + buf = (void __user *)arg; + if (!access_ok(buf, sizeof(struct hmadvise_arg))) { + pr_err("access_ok failed.\n"); + return -EFAULT; + } + + if (copy_from_user(&harg, buf, sizeof(struct hmadvise_arg))) { + pr_err("copy_from_user failed.\n"); + return -EFAULT; + } + + ret = hmadvise_inner(harg.hnid, harg.start, harg.len_in, harg.behavior); + return ret; +} + static long gmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { long ret = 0; @@ -69,6 +90,9 @@ static long gmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case GMEM_GET_HNUMA_ID: ret = gmem_get_hnid(arg); break; + case GMEM_MADVISE: + ret = gmem_hmadvise(arg); + break; default: pr_err("invalid cmd '%#x'.\n", cmd); return -EINVAL; diff --git a/include/linux/gmem.h b/include/linux/gmem.h index fb1219394f79..128d9c4d88fd 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -343,6 +343,8 @@ extern gm_ret_t gm_as_attach(gm_as_t *as, gm_dev_t *dev, gm_mmu_mode_t mode, boo extern gm_va_t gm_as_alloc(gm_as_t *as, gm_va_t hint, gm_va_t size, gm_va_t align, gm_va_t no_cross, gm_va_t max_va, gm_region_t **new_region); +extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); + enum gmem_stat_item { NR_PAGE_MIGRATING, NR_GMEM_STAT_ITEMS diff --git a/include/linux/mm.h b/include/linux/mm.h index f2b83099948c..1d25572b6964 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3823,8 +3823,17 @@ static inline bool gmem_is_enabled(void) { return static_branch_likely(&gmem_status); } + +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} #else static inline bool gmem_is_enabled(void) { return false; } +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} #endif #endif /* _LINUX_MM_H */ diff --git a/include/uapi/linux/gmem_dev.h b/include/uapi/linux/gmem_dev.h index f7fc5ca57e8c..015792660a62 100644 --- a/include/uapi/linux/gmem_dev.h +++ b/include/uapi/linux/gmem_dev.h @@ -2,16 +2,26 @@ #ifndef _UAPI_LINUX_GMEM_DEV_H #define _UAPI_LINUX_GMEM_DEV_H +#include #include #define GMEM_MAGIC 0x55 #define _GMEM_GET_HNUMA_ID 1 +#define _GMEM_MADVISE 2 struct gmem_hnid_arg { int *hnuma_id; }; +struct hmadvise_arg { + int hnid; + unsigned long start; + __kernel_size_t len_in; + int behavior; +}; + #define GMEM_GET_HNUMA_ID _IOW(GMEM_MAGIC, _GMEM_GET_HNUMA_ID, struct gmem_hnid_arg) +#define GMEM_MADVISE _IOW(GMEM_MAGIC, _GMEM_MADVISE, struct hmadvise_arg) #endif diff --git a/init/main.c b/init/main.c index 57e4a74652a7..0b1add6bf4d0 100644 --- a/init/main.c +++ b/init/main.c @@ -102,6 +102,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -908,6 +912,10 @@ asmlinkage __visible void __init __no_sanitize_address __noreturn start_kernel(v smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ boot_cpu_hotplug_init(); +#ifdef CONFIG_GMEM + 
hnuma_init(); +#endif + pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ jump_label_init(); diff --git a/mm/gmem.c b/mm/gmem.c index 8e8b1561f59f..d490e84291af 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -572,3 +572,192 @@ void hnode_deinit(unsigned int hnid, gm_dev_t *dev) xa_destroy(&hnodes[hnid]->pages); hnodes[hnid] = NULL; } + +struct prefetch_data { + struct mm_struct *mm; + gm_dev_t *dev; + unsigned long addr; + size_t size; + struct work_struct work; + int *res; +}; + +static void prefetch_work_cb(struct work_struct *work) +{ + struct prefetch_data *d = + container_of(work, struct prefetch_data, work); + unsigned long addr = d->addr, end = d->addr + d->size; + int page_size = HPAGE_SIZE; + int ret; + + do { + /* MADV_WILLNEED: dev will soon access this addr. */ + ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); + if (ret == GM_RET_PAGE_EXIST) { + pr_info("%s: device has done page fault, ignore prefetch\n", __func__); + } else if (ret != GM_RET_SUCCESS) { + *d->res = -EFAULT; + pr_err("%s: call dev fault error %d\n", __func__, ret); + } + } while (addr += page_size, addr != end); + + kfree(d); +} + +static int hmadvise_do_prefetch(gm_dev_t *dev, unsigned long addr, size_t size) +{ + unsigned long start, end, per_size; + int page_size = HPAGE_SIZE; + struct prefetch_data *data; + struct vm_area_struct *vma; + int res = GM_RET_SUCCESS; + + /* Align addr by rounding outward to make page cover addr. */ + end = round_up(addr + size, page_size); + start = round_down(addr, page_size); + size = end - start; + + mmap_read_lock(current->mm); + vma = find_vma(current->mm, start); + if (!vma || start < vma->vm_start || end > vma->vm_end) { + mmap_read_unlock(current->mm); + return GM_RET_FAILURE_UNKNOWN; + } + mmap_read_unlock(current->mm); + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (start < end) { + data = kzalloc(sizeof(struct prefetch_data), GFP_KERNEL); + if (!data) { + flush_workqueue(prefetch_wq); + return GM_RET_NOMEM; + } + + INIT_WORK(&data->work, prefetch_work_cb); + data->mm = current->mm; + data->dev = dev; + data->addr = start; + data->res = &res; + if (per_size == 0) + data->size = size; + else + /* Process (1.x * per_size) for the last time */ + data->size = (end - start < 2 * per_size) ? (end - start) : per_size; + queue_work(prefetch_wq, &data->work); + start += data->size; + } + + flush_workqueue(prefetch_wq); + return res; +} + +static int hmadvise_do_eagerfree(unsigned long addr, size_t size) +{ + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + unsigned long start, end; + gm_mapping_t *gm_mapping; + struct gm_fault_t gmf = { + .mm = current->mm, + .size = page_size, + .copy = false, + }; + vm_object_t *obj; + + /* Align addr by rounding inward to avoid excessive page release. 
*/ + end = round_down(addr + size, page_size); + start = round_up(addr, page_size); + if (start >= end) + return ret; + + mmap_read_lock(current->mm); + do { + vma = find_vma(current->mm, start); + if (!vma || !vma_is_peer_shared(vma)) { + pr_err("gmem: not peer-shared vma, skip dontneed\n"); + continue; + } + obj = vma->vm_obj; + if (!obj) { + pr_err("gmem: peer-shared vma should have vm_object\n"); + mmap_read_unlock(current->mm); + return -EINVAL; + } + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } else if (gm_mapping_cpu(gm_mapping)) { + zap_page_range_single(vma, start, page_size, NULL); + } else { + gmf.va = start; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret) { + pr_err("gmem: peer_unmap failed. ret %d\n", ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + } + set_gm_mapping_nomap(gm_mapping); + mutex_unlock(&gm_mapping->lock); + } while (start += page_size, start != end); + + mmap_read_unlock(current->mm); + return ret; +} + +static bool check_hmadvise_behavior(int behavior) +{ + return behavior == MADV_DONTNEED; +} + +int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) +{ + int error = -EINVAL; + struct hnode *node; + + if (hnid == -1) { + if (check_hmadvise_behavior(behavior)) { + goto no_hnid; + } else { + pr_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); + return error; + } + } + + if (hnid < 0) + return error; + + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) + return error; + + node = get_hnode(hnid); + if (!node) { + pr_err("hmadvise: hnode id %d is invalid\n", hnid); + return error; + } + +no_hnid: + switch (behavior) { + case MADV_PREFETCH: + return hmadvise_do_prefetch(node->dev, start, len_in); + case MADV_DONTNEED: + return hmadvise_do_eagerfree(start, len_in); + default: + pr_err("hmadvise: unsupported behavior %d\n", behavior); + } + + return error; +} +EXPORT_SYMBOL_GPL(hmadvise_inner); -- Gitee From 848492f233ce480760372e351ba60a7ffe5b2f43 Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Mon, 28 Aug 2023 10:30:11 +0800 Subject: [PATCH 11/11] mm: gmem: Introduce vm_object for gmem euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WLVX --------------------------------------------- Introduce vm_object for gmem. 
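
The expected application entry point is the MAP_PEER_SHARED mmap flag
added below to mman-common.h. A minimal userspace sketch follows; it is
illustrative only, and combining the flag with a private anonymous
mapping is an assumption rather than something mandated by the patch:

#include <stddef.h>
#include <sys/mman.h>

#ifndef MAP_PEER_SHARED
#define MAP_PEER_SHARED 0x8000000	/* added by this patch */
#endif

/* Anonymous mapping whose VMA is marked VM_PEER_SHARED and backed by a
 * vm_object, so gmem can keep a logical page table for the range.
 */
static void *alloc_peer_shared(size_t len)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED, -1, 0);

	return p == MAP_FAILED ? NULL : p;
}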
Co-developed-by: fangchuangchuang Co-developed-by: Lemmy Huang Signed-off-by: Liu Chao --- include/linux/mm.h | 13 +- include/linux/vm_object.h | 27 ++- include/uapi/asm-generic/mman-common.h | 2 + kernel/fork.c | 11 + mm/Makefile | 2 +- mm/huge_memory.c | 98 ++++++++- mm/memory.c | 65 +++++- mm/mempolicy.c | 4 + mm/mmap.c | 280 ++++++++++++++++++++++++- mm/vm_object.c | 228 ++++++++++++++++++++ 10 files changed, 710 insertions(+), 20 deletions(-) create mode 100644 mm/vm_object.c diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d25572b6964..50f04282efcb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -324,6 +324,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#ifdef CONFIG_GMEM +#define VM_PEER_SHARED BIT(56) +#else +#define VM_PEER_SHARED VM_NONE +#endif #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -3127,6 +3132,9 @@ unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long get_unmapped_area_aligned(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags, unsigned long align); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, @@ -3826,7 +3834,10 @@ static inline bool gmem_is_enabled(void) static inline bool vma_is_peer_shared(struct vm_area_struct *vma) { - return false; + if (!gmem_is_enabled()) + return false; + + return !!(vma->vm_flags & VM_PEER_SHARED); } #else static inline bool gmem_is_enabled(void) { return false; } diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index 0f691ce80028..10bb7317803c 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -2,12 +2,33 @@ #ifndef _VM_OBJECT_H #define _VM_OBJECT_H +#include +#include + #ifdef CONFIG_GMEM +/* vm_object KPI */ +int __init vm_object_init(void); +vm_object_t *vm_object_create(struct vm_area_struct *vma); +void vm_object_drop_locked(struct vm_area_struct *vma); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end); -static inline int __init vm_object_init(void) { return 0; } -static inline struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) { return NULL; } -static inline int vm_object_mapping_create(vm_object_t *obj, gm_va_t start) { return 0; } +gm_mapping_t *alloc_gm_mapping(void); +struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va); +void vm_object_mapping_create(vm_object_t *obj, gm_va_t start); +void free_gm_mappings(struct vm_area_struct *vma); +#else +static inline void __init vm_object_init(void) {} +static inline vm_object_t *vm_object_create(struct vm_area_struct *vma) { return NULL; } +static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end) {} +static inline gm_mapping_t *alloc_gm_mapping(void) { return NULL; } +static inline struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) { return NULL; } +static inline void vm_object_mapping_create(vm_object_t *obj, gm_va_t 
start) {} +static inline void free_gm_mappings(struct vm_area_struct *vma) {} #endif #endif /* _VM_OBJECT_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 029c717e6755..9f6ee16d1884 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,6 +33,8 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ +#define MAP_PEER_SHARED 0x8000000 + /* * Flags for mlock */ diff --git a/kernel/fork.c b/kernel/fork.c index a721784458d9..7e11bdaef257 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -521,6 +525,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(orig)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + new->vm_obj = vm_object_create(new); + } +#endif + return new; } diff --git a/mm/Makefile b/mm/Makefile index c1c7b88f3184..0824907eab98 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -40,7 +40,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o -mmu-$(CONFIG_GMEM) += gmem.o +mmu-$(CONFIG_GMEM) += gmem.o vm_object.o ifdef CONFIG_CROSS_MEMORY_ATTACH diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 624671aaa60d..a55c88ba305d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,6 +37,9 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif #include #include @@ -656,6 +659,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; +#ifdef CONFIG_GMEM + gm_mapping_t *gm_mapping = NULL; + + if (vma_is_peer_shared(vma)) + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); +#endif VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); @@ -663,7 +672,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } folio_throttle_swaprate(folio, gfp); @@ -673,7 +683,16 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, goto release; } +#ifdef CONFIG_GMEM + /* + * gmem device overcommit needs to reload the swapped page, + * so skip it to avoid clearing device data. 
+ */ + if (!vma_is_peer_shared(vma) || !gm_mapping_cpu(gm_mapping)) + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); +#else clear_huge_page(page, vmf->address, HPAGE_PMD_NR); +#endif /* * The memory barrier inside __folio_mark_uptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -698,7 +717,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + goto gm_mapping_release; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -706,6 +725,14 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_add_new_anon_rmap(folio, vma, haddr); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && gm_mapping_device(gm_mapping)) { + vmf->page = page; + ret = gm_host_fault_locked(vmf, PE_SIZE_PMD); + if (ret) + goto unlock_release; + } +#endif set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -713,6 +740,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + set_gm_mapping_host(gm_mapping, page); + mutex_unlock(&gm_mapping->lock); + } +#endif } return 0; @@ -722,6 +755,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, if (pgtable) pte_free(vma->vm_mm, pgtable); folio_put(folio); +gm_mapping_release: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); +#endif return ret; } @@ -780,17 +818,41 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; - struct folio *folio; + struct folio *folio = NULL; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret = 0; +#ifdef CONFIG_GMEM + gm_mapping_t *gm_mapping; + + if (vma_is_peer_shared(vma)) { + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + if (!gm_mapping) { + vm_object_mapping_create(vma->vm_obj, haddr); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + } + xa_unlock(vma->vm_obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (unlikely(!pmd_none(*vmf->pmd))) { + mutex_unlock(&gm_mapping->lock); + goto gm_mapping_release; + } + } +#endif - if (!transhuge_vma_suitable(vma, haddr)) - return VM_FAULT_FALLBACK; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + if (!transhuge_vma_suitable(vma, haddr)) { + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; + } + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto gm_mapping_release; + } khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && + !vma_is_peer_shared(vma) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; struct page *zero_page; @@ -829,12 +891,32 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } gfp = vma_thp_gfp_mask(vma); + +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + if (vma_is_peer_shared(vma)) + gfp = GFP_TRANSHUGE; + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + } +#else folio = 
vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); +#endif + if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + +gm_mapping_release: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); +#endif + return ret; } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index f69fbc251198..ed759cf1250b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,9 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif #include @@ -1522,6 +1525,47 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, return addr; } +#ifdef CONFIG_GMEM +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + gm_mapping_t *gm_mapping = NULL; + struct page *page = NULL; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, addr); + + if (gm_mapping && gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (page && (page_ref_count(page) != 0)) { + put_page(page); + gm_mapping->page = NULL; + } + } + xa_unlock(vma->vm_obj->logical_page_table); +} + +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + zap_logic_pmd_range(vma, addr, next); + } while (addr = next, addr != end); +} +#else +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +#endif + static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, @@ -1558,8 +1602,12 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * because MADV_DONTNEED holds the mmap_lock in read * mode. 
*/ - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pmd_range(vma, addr, next); goto next; + } + next = zap_pte_range(tlb, vma, pmd, addr, next, details); next: cond_resched(); @@ -1587,8 +1635,11 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, goto next; /* fall through */ } - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); @@ -1608,8 +1659,11 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none_or_clear_bad(p4d)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); @@ -1629,8 +1683,11 @@ void unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 40985c9d92d0..5ed13fe2bd75 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1716,7 +1716,11 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) +#else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) +#endif return false; /* diff --git a/mm/mmap.c b/mm/mmap.c index d600404580b2..061cc7381233 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -48,6 +48,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -647,6 +651,10 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. 
*/ if (src->anon_vma && !dst->anon_vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(dst)) + dup_vm_object(dst, src); +#endif dst->anon_vma = src->anon_vma; return anon_vma_clone(dst, src); } @@ -754,6 +762,41 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_GMEM +struct gmem_vma_list { + struct vm_area_struct *vma; + struct list_head list; +}; + +void gmem_reserve_vma(struct vm_area_struct *value, struct list_head *head) +{ + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + if (!node) { + pr_err("%s: fail to alloc memory\n", __func__); + return; + } + + node->vma = value; + list_add_tail(&node->list, head); +} + +void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + struct vm_area_struct *vma = node->vma; + + if (vma != NULL) + vm_area_free(vma); + + list_del(&node->list); + kfree(node); + } +} +#endif + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates @@ -1041,6 +1084,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_iter_store(vmi, vma); if (adj_start) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(adjust)) + vm_object_adjust(adjust, adjust->vm_start + adj_start, + adjust->vm_end); +#endif adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { @@ -1267,7 +1315,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr, /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + len = round_up(len, SZ_2M); + addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, + SZ_2M); + } else { + addr = get_unmapped_area(file, addr, len, pgoff, flags); + } +#else addr = get_unmapped_area(file, addr, len, pgoff, flags); +#endif if (IS_ERR_VALUE(addr)) return addr; @@ -1391,6 +1449,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) + vm_flags |= VM_PEER_SHARED; +#endif + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || @@ -1827,6 +1890,27 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +unsigned long +get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, unsigned long align) +{ + if (len > TASK_SIZE) + return -ENOMEM; + + addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + addr = round_up(addr, align); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + return addr; +} +EXPORT_SYMBOL(get_unmapped_area_aligned); + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
@@ -2268,6 +2352,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_mpol; +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + dup_vm_object(new, vma); +#endif + if (new->vm_file) get_file(new->vm_file); @@ -2279,6 +2368,18 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + if (new_below) { + vm_object_adjust(new, new->vm_start, addr); + vm_object_adjust(vma, addr, vma->vm_end); + } else { + vm_object_adjust(vma, vma->vm_start, addr); + vm_object_adjust(new, addr, new->vm_end); + } + } +#endif + if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; @@ -2318,6 +2419,72 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +#ifdef CONFIG_GMEM +static void munmap_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + unsigned long addr = start; + vm_object_t *obj = vma->vm_obj; + gm_ret_t ret; + gm_context_t *ctx, *tmp; + gm_mapping_t *gm_mapping; + + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + if (!obj) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + gmf.va = addr; + gmf.size = HPAGE_SIZE; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_err("%s: call dev peer_unmap error %d\n", __func__, ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); + + if (!mm->gm_as) + return; + + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + if (!ctx->dev->mmu->peer_va_free) + continue; + + gmf.va = start; + gmf.size = end - start; + gmf.dev = ctx->dev; + + ret = ctx->dev->mmu->peer_va_free(&gmf); + if (ret != GM_RET_SUCCESS) + pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", + start, end - start, ret); + } +} +#endif + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 
* @vmi: The vma iterator @@ -2401,6 +2568,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } next = vma_next(vmi); + +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + munmap_in_peer_devices(mm, vma, start, end); +#endif + if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2509,6 +2682,18 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; + if (gmem_is_enabled()) { + vma = find_vma(mm, start); + if (!vma) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + + len = round_up(len, SZ_2M); + } + } + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2541,6 +2726,57 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } +#ifdef CONFIG_GMEM +static int alloc_va_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, unsigned long len, + vm_flags_t vm_flags) +{ + gm_context_t *ctx, *tmp; + gm_prot_t prot = VM_NONE; + gm_ret_t ret; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .size = len, + .prot = prot, + }; + + pr_debug("gmem: start mmap, as %p\n", mm->gm_as); + if (!mm->gm_as) + return -ENODEV; + + prot |= vm_flags; + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + /* + * TODO: consider the concurrency problem of device + * attaching/detaching from the gm_as. + */ + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + gmf.dev = ctx->dev; + + pr_debug("gmem: call vma_alloc\n"); + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + return ret; + } + } + + return GM_RET_SUCCESS; +} +#endif + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) @@ -2555,6 +2791,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); + +retry: +#endif /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -2567,21 +2809,33 @@ unsigned long mmap_region(struct file *file, unsigned long addr, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) + (len >> PAGE_SHIFT) - nr_pages)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) + if (security_vm_enough_memory_mm(mm, charged)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } vm_flags |= VM_ACCOUNT; } @@ -2736,6 +2990,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr, file = vma->vm_file; ksm_add_vma(vma); expanded: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + gm_ret_t ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + + if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + addr = get_unmapped_area(file, addr, len, pgoff, 0); + gmem_reserve_vma(vma, &reserve_list); + goto retry; + } else if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + error = -ENOMEM; + goto free_vma; + } + gmem_release_vma(mm, &reserve_list); + } +#endif perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -2785,6 +3056,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif validate_mm(mm); return error; } diff --git a/mm/vm_object.c b/mm/vm_object.c new file mode 100644 index 000000000000..ac1a115e4ee1 --- /dev/null +++ b/mm/vm_object.c @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Logical Mapping Management + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi zhu, chao Liu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Sine VM_OBJECT maintains the logical page table under each VMA, and each VMA + * points to a VM_OBJECT. 
Ultimately VM_OBJECTs must be maintained whenever the VMA
+ * is changed: merge, split, adjust
+ */
+static struct kmem_cache *vm_object_cachep;
+static struct kmem_cache *gm_mapping_cachep;
+
+/* gm_mapping will not be released dynamically */
+gm_mapping_t *alloc_gm_mapping(void)
+{
+	gm_mapping_t *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL);
+
+	if (!gm_mapping)
+		return NULL;
+
+	set_gm_mapping_nomap(gm_mapping);
+	mutex_init(&gm_mapping->lock);
+
+	return gm_mapping;
+}
+EXPORT_SYMBOL(alloc_gm_mapping);
+
+static inline void release_gm_mapping(gm_mapping_t *mapping)
+{
+	kmem_cache_free(gm_mapping_cachep, mapping);
+}
+
+static inline gm_mapping_t *lookup_gm_mapping(vm_object_t *obj, unsigned long pindex)
+{
+	return xa_load(obj->logical_page_table, pindex);
+}
+
+int __init vm_object_init(void)
+{
+	vm_object_cachep = KMEM_CACHE(vm_object, 0);
+	if (!vm_object_cachep)
+		goto out;
+
+	gm_mapping_cachep = KMEM_CACHE(gm_mapping, 0);
+	if (!gm_mapping_cachep)
+		goto free_vm_object;
+
+	return 0;
+free_vm_object:
+	kmem_cache_destroy(vm_object_cachep);
+out:
+	return -ENOMEM;
+}
+
+/*
+ * Create a VM_OBJECT and attach it to a VMA.
+ * This should be called when a VMA is created.
+ */
+vm_object_t *vm_object_create(struct vm_area_struct *vma)
+{
+	vm_object_t *obj = kmem_cache_alloc(vm_object_cachep, GFP_KERNEL);
+
+	if (!obj)
+		return NULL;
+
+	spin_lock_init(&obj->lock);
+	obj->vma = vma;
+
+	/*
+	 * The logical page table maps linear_page_index(obj->vma, va)
+	 * to pointers of struct gm_mapping.
+	 */
+	obj->logical_page_table = kmalloc(sizeof(struct xarray), GFP_KERNEL);
+	if (!obj->logical_page_table) {
+		kmem_cache_free(vm_object_cachep, obj);
+		return NULL;
+	}
+
+	xa_init(obj->logical_page_table);
+	atomic_set(&obj->nr_pages, 0);
+	atomic_set(&obj->ref_count, 1);
+
+	return obj;
+}
+
+/* This should be called when a VMA no longer refers to a VM_OBJECT */
+void vm_object_drop_locked(struct vm_area_struct *vma)
+{
+	vm_object_t *obj = vma->vm_obj;
+
+	if (!obj) {
+		pr_err("vm_object: vm_obj of the vma is NULL\n");
+		return;
+	}
+
+	/*
+	 * We must enter this with VMA write-locked, which is unfortunately a giant lock.
+ * Note that Linux 6.0 has per-VMA lock: + * https://lwn.net/Articles/906852/ + * https://lwn.net/Articles/906833/ + */ + free_gm_mappings(vma); + mmap_assert_write_locked(vma->vm_mm); + vma->vm_obj = NULL; + + if (atomic_dec_and_test(&obj->ref_count)) { + xa_destroy(obj->logical_page_table); + kfree(obj->logical_page_table); + kmem_cache_free(vm_object_cachep, obj); + } +} + +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) +{ + unsigned long index; + gm_mapping_t *mapping; + unsigned long moved_pages = 0; + + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); + + xa_lock(dst->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, mapping, linear_page_index(src, src->vm_end)) { + index = xas.xa_index - src->vm_pgoff + dst->vm_pgoff + + ((src->vm_start - dst->vm_start) >> PAGE_SHIFT); + __xa_store(dst->vm_obj->logical_page_table, index, mapping, GFP_KERNEL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &dst->vm_obj->nr_pages); + xa_unlock(dst->vm_obj->logical_page_table); +} + +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + /* remove logical mapping in [vma->vm_start, start) and [end, vm->vm_end) */ + unsigned long removed_pages = 0; + gm_mapping_t *mapping; + + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xas_lock(&xas); + if (vma->vm_start < start) { + xas_for_each(&xas, mapping, linear_page_index(vma, start)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + + if (vma->vm_end > end) { + xas_set(&xas, linear_page_index(vma, end)); + + xas_for_each(&xas, mapping, linear_page_index(vma, vma->vm_end)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + atomic_sub(removed_pages, &vma->vm_obj->nr_pages); + xas_unlock(&xas); +} + +/* + * Given a VA, the page_index is computed by + * page_index = linear_page_index(struct vm_area_struct *vma, unsigned long address) + */ +struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) +{ + return lookup_gm_mapping(obj, linear_page_index(obj->vma, va)); +} +EXPORT_SYMBOL_GPL(vm_object_lookup); + +void vm_object_mapping_create(vm_object_t *obj, gm_va_t start) +{ + pgoff_t index = linear_page_index(obj->vma, start); + gm_mapping_t *gm_mapping; + + gm_mapping = alloc_gm_mapping(); + if (!gm_mapping) + return; + + __xa_store(obj->logical_page_table, index, gm_mapping, GFP_KERNEL); +} + +void free_gm_mappings(struct vm_area_struct *vma) +{ + gm_mapping_t *gm_mapping; + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end)) { + release_gm_mapping(gm_mapping); + xas_store(&xas, NULL); + } + xa_unlock(vma->vm_obj->logical_page_table); +} -- Gitee
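
Usage note for the series (illustrative only, not part of any patch):
userspace requests peer-shared memory by passing the new MAP_PEER_SHARED
flag to mmap(). When gmem is enabled, do_mmap() rounds the length up to
2MB, returns a 2MB-aligned address, and marks the VMA VM_PEER_SHARED. A
minimal sketch, assuming the uapi value added in patch 11 and omitting
error handling:

    #define _GNU_SOURCE
    #include <stddef.h>
    #include <sys/mman.h>

    #ifndef MAP_PEER_SHARED
    #define MAP_PEER_SHARED 0x8000000	/* include/uapi/asm-generic/mman-common.h */
    #endif

    static void *alloc_peer_shared(size_t len)
    {
            /* the kernel rounds len up to 2MB and 2MB-aligns the mapping */
            return mmap(NULL, len, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED, -1, 0);
    }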