diff --git a/drivers/base/node.c b/drivers/base/node.c index b46db17124f346aff070bf839b606046720b9854..4943a25cc272e8a47d102f468a0437c6bdf6f19f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -922,6 +922,9 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS), +#endif }; static struct attribute *node_state_attrs[] = { @@ -934,6 +937,9 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, +#ifdef CONFIG_GMEM + &node_state_attr[N_HETEROGENEOUS].attr.attr, +#endif NULL }; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 801d6c83f896163426f073400937b2de4e290215..f485063c5818aebb6c7d4272acd4ee97b9d3f536 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -421,4 +421,11 @@ config ADI and SSM (Silicon Secured Memory). Intended consumers of this driver include crash and makedumpfile. +config GMEM_DEV + tristate "driver for gmem" + depends on GMEM + default m + help + driver for gmem in order to pass ioctl commands. + endmenu diff --git a/drivers/char/Makefile b/drivers/char/Makefile index c5f532e412f1a4b93100ad51e5662563d7f3ab25..4d01ef364c8b3037244a3df8e87a9a5c590ba95a 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -44,3 +44,4 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_XILLYBUS_CLASS) += xillybus/ obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o obj-$(CONFIG_ADI) += adi.o +obj-$(CONFIG_GMEM_DEV) += gmem_dev.o diff --git a/drivers/char/gmem_dev.c b/drivers/char/gmem_dev.c new file mode 100644 index 0000000000000000000000000000000000000000..225ed506a9c095ac3daeaa5ce92e8ee304a61d13 --- /dev/null +++ b/drivers/char/gmem_dev.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static int gmem_get_hnid(unsigned long arg) +{ + void __user *buf = (void __user *)arg; + struct gmem_hnid_arg gmem_hnid; + gm_context_t *ctx, *tmp; + gm_dev_t *gm_dev = NULL; + gm_as_t *as = NULL; + int hnuma_id; + + if (!access_ok(buf, sizeof(struct gmem_hnid_arg))) { + pr_err("access_ok failed\n"); + return -EFAULT; + } + + if (copy_from_user(&gmem_hnid, buf, sizeof(struct gmem_hnid_arg))) { + pr_err("copy_from_user failed.\n"); + return -EFAULT; + } + + if (!current->mm) { + pr_err("current's mm is null.\n"); + return -EFAULT; + } + + as = current->mm->gm_as; + if (!as) { + pr_err("current isn't gmem task failed.\n"); + return -ENODEV; + } + + list_for_each_entry_safe(ctx, tmp, &as->gm_ctx_list, gm_as_link) { + gm_dev = ctx->dev; + if (gm_dev) + break; + } + + if (!gm_dev) { + pr_err("gmem_id_to_device failed.\n"); + return -ENODEV; + } + + hnuma_id = first_node(gm_dev->registered_hnodes); + if (copy_to_user(gmem_hnid.hnuma_id, &hnuma_id, sizeof(int))) { + pr_err("copy_to_user failed.\n"); + return -EFAULT; + } + + return 0; +} + +static int gmem_hmadvise(unsigned long arg) +{ + struct hmadvise_arg harg; + void __user *buf; + int ret; + + buf = (void __user *)arg; + if (!access_ok(buf, sizeof(struct hmadvise_arg))) { + pr_err("access_ok failed.\n"); + return -EFAULT; + } + + if (copy_from_user(&harg, buf, sizeof(struct hmadvise_arg))) { + pr_err("copy_from_user failed.\n"); + return -EFAULT; + } + + ret = 
hmadvise_inner(harg.hnid, harg.start, harg.len_in, harg.behavior); + return ret; +} + +static long gmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long ret = 0; + + if (_IOC_TYPE(cmd) != GMEM_MAGIC) { + pr_err("invalid cmd magic number '%#x', should '%#x'.\n", + _IOC_TYPE(cmd), GMEM_MAGIC); + return -EINVAL; + } + + switch (cmd) { + case GMEM_GET_HNUMA_ID: + ret = gmem_get_hnid(arg); + break; + case GMEM_MADVISE: + ret = gmem_hmadvise(arg); + break; + default: + pr_err("invalid cmd '%#x'.\n", cmd); + return -EINVAL; + } + + return ret; +} + +static const struct file_operations gmem_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = gmem_ioctl, + .compat_ioctl = gmem_ioctl, +}; + +static struct miscdevice gmem_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "gmem", + .fops = &gmem_fops, +}; + +builtin_misc_device(gmem_miscdev); diff --git a/include/linux/device.h b/include/linux/device.h index 472dd24d4823a6f09757ff505b6df7da1c60a5b7..6a4c901119c896ddbefbf909de25d183d594d658 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -32,6 +32,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + struct device; struct device_private; struct device_driver; @@ -655,6 +659,9 @@ struct device { #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif +#ifdef CONFIG_GMEM + gm_dev_t *gm_dev; +#endif }; /** diff --git a/include/linux/gmem.h b/include/linux/gmem.h new file mode 100644 index 0000000000000000000000000000000000000000..128d9c4d88fd713e52ea7de95f60dee8624151fe --- /dev/null +++ b/include/linux/gmem.h @@ -0,0 +1,389 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ +#ifndef _GMEM_H +#define _GMEM_H + +#include + +typedef unsigned long gm_region_placement_t; +typedef unsigned long gm_prot_t; +typedef enum gm_ret gm_ret_t; +typedef struct gm_region gm_region_t; +typedef struct gm_mapping_set gm_mapping_set_t; +typedef enum gm_mmu_mode gm_mmu_mode_t; +typedef struct gm_mmu gm_mmu_t; +typedef unsigned long gm_dev_cap_t; +typedef struct gm_context gm_context_t; +typedef struct gm_dev gm_dev_t; +typedef struct gm_mapping gm_mapping_t; + +struct hnode; + +/* + * enum gm_ret - The return value of GMEM KPI that can be used to tell + * the core VM or peripheral driver whether the GMEM KPI was + * executed successfully. + * + * @GM_RET_SUCCESS: The invoked GMEM KPI behaved as expected. + * @GM_RET_FAILURE_UNKNOWN: The GMEM KPI failed with unknown reason. + * Any external status related to this KPI invocation changes must be rolled back. + */ +enum gm_ret { + GM_RET_SUCCESS = 0, + GM_RET_NOMEM, + GM_RET_PAGE_EXIST, + GM_RET_DMA_ERROR, + GM_RET_MIGRATING, + GM_RET_FAILURE_UNKNOWN, + GM_RET_UNIMPLEMENTED, +}; + +/* + * Defines a contiguous range of virtual addresses inside a gm_as_t + * As an analogy, this is conceptually similar as virtual_address_struct + */ +struct gm_region { + gm_va_t start_va; + gm_va_t end_va; + struct rb_node node; + gm_as_t *as; /* The address space that it belongs to */ + + /* Do we need another list_node to maintain a tailQ of allocated VMAs inside a gm_as? */ + struct list_head mapping_set_link; + + void (*callback_op)(void *args); + void *cb_args; +}; + +/* This holds a list of regions that must not be concurrently manipulated. */ +struct gm_mapping_set { + unsigned int region_cnt; + struct list_head gm_region_list; +}; + +/** + * enum gm_mmu_mode - defines the method to share a physical page table. 
+ * + * @GM_MMU_MODE_SHARE: Literally share a physical page table with another + * attached device's MMU. Nothing is guaranteed about the allocated address. + * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds + * exclusive mapping entries, so that device memory accesses can trigger fault-driven + * migration for automatic data locality optimizations. + * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates physical + * mapping entries whenever a physical mapping is installed inside the address space, so + * that it may minimize the page faults to be triggered by this device. + */ +enum gm_mmu_mode { + GM_MMU_MODE_SHARE, + GM_MMU_MODE_COHERENT_EXCLUSIVE, + GM_MMU_MODE_REPLICATE, +}; + +/* + * This is the parameter list of peer_map/unmap mmu operations. + * if device should copy data to/from host, set copy and dma_addr + */ +struct gm_fault_t { + struct mm_struct *mm; + gm_dev_t *dev; + gm_va_t va; + gm_pa_t size; + gm_prot_t prot; + bool copy; + dma_addr_t dma_addr; + int behavior; +}; + +struct gm_memcpy_t { + struct mm_struct *mm; + gm_dev_t *dev; + gm_va_t src; + gm_va_t dest; + dma_addr_t dma_addr; + size_t size; +}; + +/** + * + * This struct defines a series of MMU functions registered by a peripheral + * device that is to be invoked by GMEM. + * + * pmap is an opaque pointer that identifies a physical page table of a device. + * A physical page table holds the physical mappings that can be interpreted by + * the hardware MMU. + */ +struct gm_mmu { + /* + * Each bit indicates a supported page size for page-based TLB. + * Currently we do not consider range TLBs. + */ + unsigned long pgsize_bitmap; + + /* + * cookie identifies the type of the MMU. If two gm_mmu shares the same cookie, + * then it means their page table formats are compatible. + * In that case, they can share the same void *pmap as the input arg. + */ + unsigned long cookie; + + /* Synchronize VMA in a peer OS to interact with the host OS */ + gm_ret_t (*peer_va_alloc_fixed)(struct gm_fault_t *gmf); + gm_ret_t (*peer_va_free)(struct gm_fault_t *gmf); + + /* Create physical mappings on peer host. + * If copy is set, copy data [dma_addr, dma_addr + size] to peer host + */ + gm_ret_t (*peer_map)(struct gm_fault_t *gmf); + /* + * Destroy physical mappings on peer host. + * If copy is set, copy data back to [dma_addr, dma_addr + size] + */ + gm_ret_t (*peer_unmap)(struct gm_fault_t *gmf); + + /* Create or destroy a device's physical page table. */ + gm_ret_t (*pmap_create)(gm_dev_t *dev, void **pmap); + gm_ret_t (*pmap_destroy)(void *pmap); + + /* Create or destroy a physical mapping of a created physical page table */ + gm_ret_t (*pmap_enter)(void *pmap, gm_va_t va, gm_va_t size, + gm_pa_t pa, gm_prot_t prot); + gm_ret_t (*pmap_release)(void *pmap, gm_va_t va, gm_va_t size); + + /* Change the protection of a virtual page */ + gm_ret_t (*pmap_protect)(void *pmap, gm_va_t va, gm_va_t size, gm_prot_t new_prot); + + /* Invalidation functions of the MMU TLB */ + gm_ret_t (*tlb_invl)(void *pmap, gm_va_t va, gm_va_t size); + gm_ret_t (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); +}; + +/** + * gm_dev_cap_t defines a composable flag to describe the capabilities of a device. + * + * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. 
+ * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS + */ +#define GM_DEV_CAP_REPLAYABLE 0x00000001 +#define GM_DEV_CAP_PEER 0x00000010 + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +struct gm_context { + gm_as_t *as; + gm_dev_t *dev; + void *pmap; + /* + * consider a better container to maintain multiple ctx inside a device or multiple ctx + * inside a va space. + * A device may simultaneously have multiple contexts for time-sliced ctx switching + */ + struct list_head gm_dev_link; + + /* A va space may have multiple gm_context */ + struct list_head gm_as_link; +}; +#define get_gm_context(head) (list_entry((head)->prev, gm_context_t, ctx_link)) + +struct gm_dev { + int id; + + /* identifies the device capability + * For example, whether the device supports page faults or whether it has its + * own OS that manages the VA and PA resources. + */ + gm_dev_cap_t capability; + gm_mmu_t *mmu; + void *dev_data; + /* + * TODO: Use a better container of gm_context_t to support time-sliced context switch. + * A collection of device contexts. If the device does not support time-sliced context + * switch, then the size of the collection should never be greater than one. + * We need to think about what operators should the container be optimized for. + * A list, a radix-tree or what? What would gm_dev_activate require? + * Are there any accelerators that are really going to support time-sliced context switch? + */ + gm_context_t *current_ctx; + + struct list_head gm_ctx_list; + + /* Add tracking of registered device local physical memory. */ + nodemask_t registered_hnodes; + struct device *dma_dev; + + gm_mapping_t *gm_mapping; +}; + +#define HOST_NODE_ID (-1) + +#define GM_PAGE_DIRTY 0x8 /* Whether the page is dirty */ +#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ +#define GM_PAGE_DEVICE 0x20 +#define GM_PAGE_NOMAP 0x40 +#define GM_PAGE_PINNED 0x80 +#define GM_PAGE_WILLNEED 0x100 + +#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) + +/* Records the status of a page-size physical page */ +struct gm_mapping { + /* + * The node index may have three definitions: + * 1. a common CPU node + * 2. a hetero-node, e.g. GPU (that not necessarily supports CC ld/st) + * 3. 
a network ip (another OS that may have multiple hNUMA nodes), dynamically attached by dsm_attach + * Among these definitions, #1 and #2 in combination defines an h-NUMA topology + */ + unsigned int node_id; + + unsigned int flag; + + union { + struct page *page; /* CPU node */ + gm_dev_t *dev; /* hetero-node */ + gm_pa_t pfn; + }; + + struct mutex lock; +}; + +static inline bool gm_mapping_cpu(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_CPU); +} + +static inline void set_gm_mapping_host(gm_mapping_t *gm_mapping, struct page *page) +{ + gm_mapping->node_id = HOST_NODE_ID; + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_CPU; + gm_mapping->page = page; +} + +static inline bool gm_mapping_device(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_DEVICE); +} + +static inline void set_gm_mapping_device(gm_mapping_t *gm_mapping, gm_dev_t *dev) +{ + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_DEVICE; + gm_mapping->dev = dev; +} + +static inline bool gm_mapping_nomap(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_NOMAP); +} + +static inline void set_gm_mapping_nomap(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_NOMAP; + gm_mapping->page = NULL; +} + +static inline void set_gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag |= GM_PAGE_WILLNEED; +} + +static inline void clear_gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_WILLNEED; +} + +static inline bool gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_WILLNEED); +} + +static inline void set_gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag |= GM_PAGE_PINNED; +} + +static inline void clear_gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_PINNED; +} + +static inline bool gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_PINNED); +} + +#define test_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define set_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } + +/* GMEM Device KPI */ +extern gm_ret_t gm_dev_create(gm_mmu_t *mmu, void *dev_data, gm_dev_cap_t cap, gm_dev_t **new_dev); +extern gm_ret_t gm_dev_destroy(gm_dev_t *dev); +extern gm_ret_t gm_dev_switch(gm_dev_t *dev, gm_as_t *as); +extern gm_ret_t gm_dev_detach(gm_dev_t *dev, gm_as_t *as); +extern gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end); +gm_ret_t gm_dev_fault(struct mm_struct *mm, gm_va_t addr, gm_dev_t *dev, int behavior); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size); + +/* GMEM address space KPI */ +extern gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end); +extern void gm_dev_unregister_physmem(gm_dev_t *dev, unsigned int nid); +extern gm_mapping_t *gm_mappings_alloc(unsigned int nid, unsigned int order); +extern void gm_mappings_free(gm_mapping_t *mapping, unsigned int order); +extern gm_ret_t gm_as_create(gm_va_t begin, gm_va_t end, gm_as_alloc_t policy, gm_va_t cache_quantum, gm_as_t **new_as); +extern gm_ret_t gm_as_destroy(gm_as_t *as); +extern gm_ret_t gm_as_attach(gm_as_t *as, gm_dev_t *dev, gm_mmu_mode_t mode, bool activate, gm_context_t **out_ctx); +extern gm_va_t gm_as_alloc(gm_as_t *as, gm_va_t hint, gm_va_t size, gm_va_t align, gm_va_t no_cross, + gm_va_t max_va, 
gm_region_t **new_region); + +extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); + +enum gmem_stat_item { + NR_PAGE_MIGRATING, + NR_GMEM_STAT_ITEMS +}; + +extern void gmem_state_counter(enum gmem_stat_item item, int val); +extern void gmem_state_counter_show(void); + +/* h-NUMA topology */ +struct hnode { + unsigned int id; + + gm_dev_t *dev; + + struct xarray pages; +}; + +extern struct hnode *hnodes[]; + +static inline bool is_hnode(int node) +{ + return !node_isset(node, node_possible_map) + && node_isset(node, hnode_map); +} + +static inline bool is_hnode_allowed(int node) +{ + return is_hnode(node) && node_isset(node, current->mems_allowed); +} + +static inline struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +void __init hnuma_init(void); +unsigned int alloc_hnode_id(void); +void free_hnode_id(unsigned int nid); +void hnode_init(struct hnode *hnode, unsigned int hnid, gm_dev_t *dev); +void hnode_deinit(unsigned int hnid, gm_dev_t *dev); + +#endif /* _GMEM_H */ diff --git a/include/linux/gmem_as.h b/include/linux/gmem_as.h new file mode 100644 index 0000000000000000000000000000000000000000..f62dac65e2edca1e37c96e407b60e876f2c9d8b8 --- /dev/null +++ b/include/linux/gmem_as.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _GMEM_AS_H +#define _GMEM_AS_H + +typedef struct gm_as gm_as_t; +typedef unsigned long gm_va_t; +typedef unsigned long gm_pa_t; +typedef enum gm_as_alloc gm_as_alloc_t; + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. 
*/ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of gm_as_t */ + struct rb_root rbroot; /*root of gm_region_t */ + gm_as_alloc_t policy; + gm_va_t start_va; + gm_va_t end_va; + gm_va_t cache_quantum; /* defines the VA unit size if an object cache is applied */ + + struct list_head gm_ctx_list; /* tracks device contexts attached to this va space, using gm_as_link */ +}; + +#endif diff --git a/include/linux/gmem_dev.h b/include/linux/gmem_dev.h new file mode 100644 index 0000000000000000000000000000000000000000..b1359f19b85070ab3005a8740b736ebf3207ee09 --- /dev/null +++ b/include/linux/gmem_dev.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __GMEM_DEV_H +#define __GMEM_DEV_H + +#include + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 27ce77080c79c7a026e641e491246fcf6f7e26c0..50f04282efcb1ecb76cf76e43031ee33310dff67 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -324,6 +324,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#ifdef CONFIG_GMEM +#define VM_PEER_SHARED BIT(56) +#else +#define VM_PEER_SHARED VM_NONE +#endif #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -3127,6 +3132,9 @@ unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long get_unmapped_area_aligned(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags, unsigned long align); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, @@ -3816,4 +3824,27 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif +#ifdef CONFIG_GMEM +DECLARE_STATIC_KEY_FALSE(gmem_status); + +static inline bool gmem_is_enabled(void) +{ + return static_branch_likely(&gmem_status); +} + +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + if (!gmem_is_enabled()) + return false; + + return !!(vma->vm_flags & VM_PEER_SHARED); +} +#else +static inline bool gmem_is_enabled(void) { return false; } +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 306a3d1a0fa655af73e3b3f37446dea60e4fe902..cac73ccf7367a3e7fd876b3fc0d640ec981a1ad7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #ifndef AT_VECTOR_SIZE_ARCH @@ -465,6 +469,44 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +#ifdef CONFIG_GMEM +/* + * Defines a centralized logical mapping table that reflects the mapping information + * regardless of the underlying arch-specific MMUs. + * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well + * as the filemap address_space struct from Linux page cache. + * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that + * the coordiantion between page tables must happen with CPU page table involved. 
That + * is to say, a generalized process unit must involve in a UVA-programming model, otherwise + * there is no point to support UVA programming. + * However, a VMA only needs to maintain logical mappings if the process has been + * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless + * we later build a reservation system on top of the logical mapping tables to support + * reservation-based superpages and rangeTLBs). + * A GM_REGION does not need to maintain logical mappings. In the case that a device wants + * to support its private address space with local physical memory, GMEM should forward address + * space management to the core VM, using VMAs, instead of using GM_REGIONs. + */ +struct vm_object { + spinlock_t lock; + struct vm_area_struct *vma; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; + + /* + * a vm object might be referred by multiple VMAs to share + * memory. + */ + atomic_t ref_count; +}; +typedef struct vm_object vm_object_t; +#endif + struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. */ @@ -571,6 +613,9 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_GMEM + struct vm_object *vm_obj; +#endif } __randomize_layout; #ifdef CONFIG_SCHED_MM_CID @@ -802,6 +847,9 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN */ +#ifdef CONFIG_GMEM + gm_as_t *gm_as; +#endif } __randomize_layout; /* diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index bb0ee80526b2dc30023cdda7dc6a8022f81727a4..39820123d7934456c2ad0fee44d13a607b6dd23b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -407,6 +407,9 @@ enum node_states { N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ +#ifdef CONFIG_GMEM + N_HETEROGENEOUS, /* The node has heterogeneous memory */ +#endif NR_NODE_STATES }; @@ -536,6 +539,13 @@ static inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +#ifdef CONFIG_GMEM +/* For h-NUMA topology */ +#define hnode_map node_states[N_HETEROGENEOUS] +#define num_hnodes() num_node_state(N_HETEROGENEOUS) +#define for_each_hnode(node) for_each_node_state(node, N_HETEROGENEOUS) +#endif + /* * For nodemask scratch area. 
* NODEMASK_ALLOC(type, name) allocates an object with a specified type and diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h new file mode 100644 index 0000000000000000000000000000000000000000..10bb7317803c0a973f50c9f0bfb014f045516180 --- /dev/null +++ b/include/linux/vm_object.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VM_OBJECT_H +#define _VM_OBJECT_H + +#include +#include + +#ifdef CONFIG_GMEM +/* vm_object KPI */ +int __init vm_object_init(void); +vm_object_t *vm_object_create(struct vm_area_struct *vma); +void vm_object_drop_locked(struct vm_area_struct *vma); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end); + +gm_mapping_t *alloc_gm_mapping(void); +struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va); +void vm_object_mapping_create(vm_object_t *obj, gm_va_t start); +void free_gm_mappings(struct vm_area_struct *vma); +#else +static inline void __init vm_object_init(void) {} +static inline vm_object_t *vm_object_create(struct vm_area_struct *vma) { return NULL; } +static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end) {} + +static inline gm_mapping_t *alloc_gm_mapping(void) { return NULL; } +static inline struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) { return NULL; } +static inline void vm_object_mapping_create(vm_object_t *obj, gm_va_t start) {} +static inline void free_gm_mappings(struct vm_area_struct *vma) {} +#endif + +#endif /* _VM_OBJECT_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432c64599f706b86e74a12581c2a54e..9f6ee16d18847cb34173cc3561e0be0d67ffc7ce 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,6 +33,8 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ +#define MAP_PEER_SHARED 0x8000000 + /* * Flags for mlock */ @@ -79,6 +81,11 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* for hmadvise */ +#define MADV_GMEM_BASE 0x1000 +#define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ +#define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/linux/gmem_dev.h b/include/uapi/linux/gmem_dev.h new file mode 100644 index 0000000000000000000000000000000000000000..015792660a62059effd7f9dc438af16b1fd1c2e8 --- /dev/null +++ b/include/uapi/linux/gmem_dev.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_GMEM_DEV_H +#define _UAPI_LINUX_GMEM_DEV_H + +#include +#include + +#define GMEM_MAGIC 0x55 + +#define _GMEM_GET_HNUMA_ID 1 +#define _GMEM_MADVISE 2 + +struct gmem_hnid_arg { + int *hnuma_id; +}; + +struct hmadvise_arg { + int hnid; + unsigned long start; + __kernel_size_t len_in; + int behavior; +}; + +#define GMEM_GET_HNUMA_ID _IOW(GMEM_MAGIC, _GMEM_GET_HNUMA_ID, struct gmem_hnid_arg) +#define GMEM_MADVISE _IOW(GMEM_MAGIC, _GMEM_MADVISE, struct hmadvise_arg) + +#endif diff --git a/init/main.c b/init/main.c index 57e4a74652a775ab124cd6a9a170600e1b893cf3..0b1add6bf4d061f987e4e2169b7bb6b5e3f00fb8 100644 --- a/init/main.c +++ b/init/main.c @@ -102,6 +102,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include 
@@ -908,6 +912,10 @@ asmlinkage __visible void __init __no_sanitize_address __noreturn start_kernel(v smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ boot_cpu_hotplug_init(); +#ifdef CONFIG_GMEM + hnuma_init(); +#endif + pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ jump_label_init(); diff --git a/kernel/fork.c b/kernel/fork.c index a721784458d9c38a3eca79f6c33b087f6e4f12a5..7e11bdaef25777e8abf98994894d8b6d409bf47f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -521,6 +525,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(orig)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + new->vm_obj = vm_object_create(new); + } +#endif + return new; } diff --git a/kernel/pid.c b/kernel/pid.c index f93954a0384d3889fb6d52de977562d10be1a012..69089222d8d8d7d85e766f0ff906caf77d558b6d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -434,6 +434,7 @@ struct task_struct *find_get_task_by_vpid(pid_t nr) return task; } +EXPORT_SYMBOL_GPL(find_get_task_by_vpid); struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { diff --git a/mm/Kconfig b/mm/Kconfig index 7672a22647b4a2434c22bda7b92b3897efc84783..b950407dd87fd9d814b891bad3138967c4c33cb7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1206,6 +1206,15 @@ config PER_VMA_LOCK This feature allows locking each virtual memory area separately when handling page faults instead of taking mmap_lock. +config GMEM + bool "gmem subsystem for multi-MMU cooperative management" + depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE + select ARCH_USES_HIGH_VMA_FLAGS + select GMEM_DEV + default y + help + say Y here to enable gmem subsystem + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index e29afc890cde2583124dabb8332be8f0f367313c..0824907eab98e9bc9e8f856c8e071bdd8fcba3cb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -40,6 +40,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o +mmu-$(CONFIG_GMEM) += gmem.o vm_object.o ifdef CONFIG_CROSS_MEMORY_ATTACH diff --git a/mm/gmem.c b/mm/gmem.c new file mode 100644 index 0000000000000000000000000000000000000000..d490e84291aff2e1b3a450fd8ca392ad8f9b25c1 --- /dev/null +++ b/mm/gmem.c @@ -0,0 +1,763 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Weixi Zhu + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(gmem_status); +EXPORT_SYMBOL_GPL(gmem_status); + +static struct kmem_cache *gm_as_cache; +static struct kmem_cache *gm_dev_cache; +static struct kmem_cache *gm_ctx_cache; +static struct kmem_cache *gm_region_cache; +static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); + +static bool enable_gmem; + +static inline unsigned long pe_mask(enum page_entry_size pe_size) +{ + if (pe_size == PE_SIZE_PTE) + return PAGE_MASK; + if (pe_size == PE_SIZE_PMD) + return HPAGE_PMD_MASK; + if (pe_size == PE_SIZE_PUD) + return HPAGE_PUD_MASK; + return ~0; +} + +static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; + +void gmem_state_counter(enum gmem_stat_item item, int val) +{ + if (!gmem_is_enabled()) + return; + + if (WARN_ON_ONCE(unlikely(item >= NR_GMEM_STAT_ITEMS))) + return; + + percpu_counter_add(&g_gmem_stats[item], val); +} + +#ifdef CONFIG_PROC_FS +static int gmemstat_show(struct seq_file *m, void *arg) +{ + if (!gmem_is_enabled()) + return 0; + + seq_printf(m, "migrating : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING])); + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static struct workqueue_struct *prefetch_wq; + +#define GM_WORK_CONCURRENCY 4 + +static int __init gmem_init(void) +{ + int err = -ENOMEM; + + if (!enable_gmem) + return 0; + + gm_as_cache = KMEM_CACHE(gm_as, 0); + if (!gm_as_cache) + goto out; + + gm_dev_cache = KMEM_CACHE(gm_dev, 0); + if (!gm_dev_cache) + goto free_as; + + gm_ctx_cache = KMEM_CACHE(gm_context, 0); + if (!gm_ctx_cache) + goto free_dev; + + gm_region_cache = KMEM_CACHE(gm_region, 0); + if (!gm_region_cache) + goto free_ctx; + + err = vm_object_init(); + if (err) + goto free_ctx; + + prefetch_wq = alloc_workqueue("prefetch", + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); + if (!prefetch_wq) { + pr_info("fail to alloc workqueue prefetch_wq\n"); + err = -EFAULT; + goto free_ctx; + } + +#ifdef CONFIG_PROC_FS + proc_create_single("gmemstat", 0444, NULL, gmemstat_show); +#endif + + static_branch_enable(&gmem_status); + + return 0; + +free_ctx: + kmem_cache_destroy(gm_ctx_cache); +free_dev: + kmem_cache_destroy(gm_dev_cache); +free_as: + kmem_cache_destroy(gm_as_cache); +out: + return -ENOMEM; +} +subsys_initcall(gmem_init); + +static int __init setup_gmem(char *str) +{ + strtobool(str, &enable_gmem); + + return 1; +} +__setup("gmem=", setup_gmem); + +/* + * Create a GMEM device, register its MMU function and the page table. + * The returned device pointer will be passed by new_dev. + * A unique id will be assigned to the GMEM device, using Linux's xarray. 
+ */ +gm_ret_t gm_dev_create(gm_mmu_t *mmu, void *dev_data, gm_dev_cap_t cap, gm_dev_t **new_dev) +{ + gm_dev_t *dev; + + if (!gmem_is_enabled()) + return GM_RET_FAILURE_UNKNOWN; + + dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); + if (!dev) + return GM_RET_NOMEM; + + if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, GFP_KERNEL)) { + kmem_cache_free(gm_dev_cache, dev); + return GM_RET_NOMEM; + } + + dev->capability = cap; + dev->mmu = mmu; + dev->dev_data = dev_data; + dev->current_ctx = NULL; + INIT_LIST_HEAD(&dev->gm_ctx_list); + *new_dev = dev; + nodes_clear(dev->registered_hnodes); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_create); + +/* Handle the page fault triggered by a given device */ +gm_ret_t gm_dev_fault(struct mm_struct *mm, gm_va_t addr, gm_dev_t *dev, int behavior) +{ + gm_ret_t ret = GM_RET_SUCCESS; + gm_mmu_t *mmu = dev->mmu; + struct device *dma_dev = dev->dma_dev; + struct vm_area_struct *vma; + vm_object_t *obj; + gm_mapping_t *gm_mapping; + gm_va_t size = HPAGE_SIZE; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior + }; + struct page *page = NULL; + + mmap_read_lock(mm); + + vma = find_vma(mm, addr); + if (!vma) { + pr_info("gmem: %s no vma\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + obj = vma->vm_obj; + if (!obj) { + pr_info("gmem: %s no vm_obj\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + vm_object_mapping_create(obj, addr); + gm_mapping = vm_object_lookup(obj, addr); + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + goto peer_map; + } else if (gm_mapping_device(gm_mapping)) { + if (behavior == MADV_WILLNEED || behavior == MADV_PINNED) { + goto peer_map; + } else { + ret = 0; + goto unlock; + } + } else if (gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (!page) { + pr_err("gmem: host gm_mapping page is NULL. Set nomap\n"); + set_gm_mapping_nomap(gm_mapping); + goto unlock; + } + get_page(page); + zap_page_range_single(vma, addr, size, NULL); + gmf.dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) + pr_info("gmem: dma map failed\n"); + + gmf.copy = true; + } + +peer_map: + ret = mmu->peer_map(&gmf); + if (ret != GM_RET_SUCCESS) { + if (ret == GM_RET_MIGRATING) { + /* + * gmem page is migrating due to overcommit. 
+ * update page to willneed and this will stop page evicting + */ + set_gm_mapping_willneed(gm_mapping); + gmem_state_counter(NR_PAGE_MIGRATING, 1); + ret = GM_RET_SUCCESS; + } else { + pr_err("gmem: peer map failed\n"); + if (page) { + set_gm_mapping_nomap(gm_mapping); + put_page(page); + } + } + goto unlock; + } + + if (page) { + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + put_page(page); + } + + set_gm_mapping_device(gm_mapping, dev); +unlock: + mutex_unlock(&gm_mapping->lock); +mmap_unlock: + mmap_read_unlock(mm); + return ret; +} +EXPORT_SYMBOL_GPL(gm_dev_fault); + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size) +{ + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & pe_mask(pe_size); + vm_object_t *obj = vma->vm_obj; + gm_mapping_t *gm_mapping; + gm_va_t size = HPAGE_SIZE; + gm_dev_t *dev; + struct device *dma_dev; + struct gm_fault_t gmf = { + .mm = vma->vm_mm, + .va = addr, + .size = size, + .copy = true, + }; + + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + pr_err("gmem: host fault gm_mapping should not be NULL\n"); + return VM_FAULT_SIGBUS; + } + + dev = gm_mapping->dev; + gmf.dev = dev; + dma_dev = dev->dma_dev; + gmf.dma_addr = dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + pr_err("gmem: host fault dma mapping error\n"); + return VM_FAULT_SIGBUS; + } + if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { + pr_err("gmem: peer unmap failed\n"); + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return VM_FAULT_SIGBUS; + } + + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return ret; +} + +/* + * Register the local physical memory of a gmem device. + * This implies dynamically creating + * the struct page data structures. 
+ */ +gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end) +{ + gm_mapping_t *mapping; + gm_pa_t addr = PAGE_ALIGN(begin); + unsigned int nid; + int i, page_num = (end - addr) >> PAGE_SHIFT; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + + if (!hnode) + goto err; + + nid = alloc_hnode_id(); + if (nid == MAX_NUMNODES) + goto free_hnode; + hnode_init(hnode, nid, dev); + + mapping = kvmalloc(sizeof(gm_mapping_t) * page_num, GFP_KERNEL); + if (!mapping) + goto deinit_hnode; + + for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { + mapping[i].node_id = hnode->id; + mapping[i].pfn = addr >> PAGE_SHIFT; + mapping[i].flag = 0; + } + + xa_lock(&hnode->pages); + for (i = 0; i < page_num; i++) { + if (xa_err(__xa_store(&hnode->pages, i, mapping + i, GFP_KERNEL))) { + /* Probably nomem */ + kvfree(mapping); + xa_unlock(&hnode->pages); + goto deinit_hnode; + } + __xa_set_mark(&hnode->pages, i, XA_MARK_0); + } + xa_unlock(&hnode->pages); + + return GM_RET_SUCCESS; + +deinit_hnode: + hnode_deinit(nid, dev); + free_hnode_id(nid); +free_hnode: + kfree(hnode); +err: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gm_dev_register_physmem); + +void gm_dev_unregister_physmem(gm_dev_t *dev, unsigned int nid) +{ + struct hnode *hnode = get_hnode(nid); + gm_mapping_t *mapping = xa_load(&hnode->pages, 0); + + kvfree(mapping); + hnode_deinit(nid, dev); + free_hnode_id(nid); + kfree(hnode); +} +EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); + +gm_mapping_t *gm_mappings_alloc(unsigned int nid, unsigned int order) +{ + gm_mapping_t *mapping; + struct hnode *node = get_hnode(nid); + XA_STATE(xas, &node->pages, 0); + + /* TODO: support order > 0 */ + if (order != 0) + return ERR_PTR(-EINVAL); + + xa_lock(&node->pages); + mapping = xas_find_marked(&xas, ULONG_MAX, XA_MARK_0); + if (!mapping) { + xa_unlock(&node->pages); + return ERR_PTR(-ENOMEM); + } + + xas_clear_mark(&xas, XA_MARK_0); + xa_unlock(&node->pages); + + return mapping; +} +EXPORT_SYMBOL_GPL(gm_mappings_alloc); + +void gm_mappings_free(gm_mapping_t *mapping, unsigned int order) +{ + gm_mapping_t *entry; + struct hnode *node = get_hnode(mapping->node_id); + XA_STATE(xas, &node->pages, 0); + + /* TODO: support order > 0 */ + if (order != 0) + return; + + xas_for_each(&xas, entry, ULONG_MAX) { + if (entry == mapping) { + xas_set_mark(&xas, XA_MARK_0); + break; + } + } +} +EXPORT_SYMBOL_GPL(gm_mappings_free); + +/* GMEM Virtual Address Space API */ +gm_ret_t gm_as_create(gm_va_t begin, gm_va_t end, gm_as_alloc_t policy, + gm_va_t cache_quantum, gm_as_t **new_as) +{ + gm_as_t *as; + + if (!new_as) + return -EINVAL; + + as = kmem_cache_alloc(gm_as_cache, GFP_ATOMIC); + if (!as) + return -ENOMEM; + + spin_lock_init(&as->rbtree_lock); + as->rbroot = RB_ROOT; + as->start_va = begin; + as->end_va = end; + as->policy = policy; + + INIT_LIST_HEAD(&as->gm_ctx_list); + + *new_as = as; + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_create); + +gm_ret_t gm_as_destroy(gm_as_t *as) +{ + gm_context_t *ctx, *tmp_ctx; + + list_for_each_entry_safe(ctx, tmp_ctx, &as->gm_ctx_list, gm_as_link) + kfree(ctx); + + kmem_cache_free(gm_as_cache, as); + + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_destroy); + +gm_ret_t gm_as_attach(gm_as_t *as, gm_dev_t *dev, gm_mmu_mode_t mode, + bool activate, gm_context_t **out_ctx) +{ + gm_context_t *ctx; + int nid; + int ret; + + ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); + if (!ctx) + return GM_RET_NOMEM; + + ctx->as = as; + ctx->dev = dev; + ctx->pmap = NULL; + ret = dev->mmu->pmap_create(dev, 
&ctx->pmap); + if (ret) { + kmem_cache_free(gm_ctx_cache, ctx); + return ret; + } + + INIT_LIST_HEAD(&ctx->gm_dev_link); + INIT_LIST_HEAD(&ctx->gm_as_link); + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); + list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); + + if (activate) { + /* + * Here we should really have a callback function to perform the context switch + * for the hardware. E.g. in x86 this function is effectively flushing the CR3 value. + * Currently we do not care time-sliced context switch, unless someone wants to support it. + */ + dev->current_ctx = ctx; + } + *out_ctx = ctx; + + /* + * gm_as_attach will be used to attach device to process address space. + * Handle this case and add hnodes registered by device to process mems_allowed. + */ + for_each_node_mask(nid, dev->registered_hnodes) + node_set(nid, current->mems_allowed); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_attach); + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +void __init hnuma_init(void) +{ + unsigned int node; + + for_each_node(node) + node_set(node, hnode_map); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + spin_unlock(&hnode_lock); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, gm_dev_t *dev) +{ + hnodes[hnid] = hnode; + hnodes[hnid]->id = hnid; + hnodes[hnid]->dev = dev; + node_set(hnid, dev->registered_hnodes); + xa_init(&hnodes[hnid]->pages); +} + +void hnode_deinit(unsigned int hnid, gm_dev_t *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + xa_destroy(&hnodes[hnid]->pages); + hnodes[hnid] = NULL; +} + +struct prefetch_data { + struct mm_struct *mm; + gm_dev_t *dev; + unsigned long addr; + size_t size; + struct work_struct work; + int *res; +}; + +static void prefetch_work_cb(struct work_struct *work) +{ + struct prefetch_data *d = + container_of(work, struct prefetch_data, work); + unsigned long addr = d->addr, end = d->addr + d->size; + int page_size = HPAGE_SIZE; + int ret; + + do { + /* MADV_WILLNEED: dev will soon access this addr. */ + ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); + if (ret == GM_RET_PAGE_EXIST) { + pr_info("%s: device has done page fault, ignore prefetch\n", __func__); + } else if (ret != GM_RET_SUCCESS) { + *d->res = -EFAULT; + pr_err("%s: call dev fault error %d\n", __func__, ret); + } + } while (addr += page_size, addr != end); + + kfree(d); +} + +static int hmadvise_do_prefetch(gm_dev_t *dev, unsigned long addr, size_t size) +{ + unsigned long start, end, per_size; + int page_size = HPAGE_SIZE; + struct prefetch_data *data; + struct vm_area_struct *vma; + int res = GM_RET_SUCCESS; + + /* Align addr by rounding outward to make page cover addr. 
*/ + end = round_up(addr + size, page_size); + start = round_down(addr, page_size); + size = end - start; + + mmap_read_lock(current->mm); + vma = find_vma(current->mm, start); + if (!vma || start < vma->vm_start || end > vma->vm_end) { + mmap_read_unlock(current->mm); + return GM_RET_FAILURE_UNKNOWN; + } + mmap_read_unlock(current->mm); + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (start < end) { + data = kzalloc(sizeof(struct prefetch_data), GFP_KERNEL); + if (!data) { + flush_workqueue(prefetch_wq); + return GM_RET_NOMEM; + } + + INIT_WORK(&data->work, prefetch_work_cb); + data->mm = current->mm; + data->dev = dev; + data->addr = start; + data->res = &res; + if (per_size == 0) + data->size = size; + else + /* Process (1.x * per_size) for the last time */ + data->size = (end - start < 2 * per_size) ? (end - start) : per_size; + queue_work(prefetch_wq, &data->work); + start += data->size; + } + + flush_workqueue(prefetch_wq); + return res; +} + +static int hmadvise_do_eagerfree(unsigned long addr, size_t size) +{ + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + unsigned long start, end; + gm_mapping_t *gm_mapping; + struct gm_fault_t gmf = { + .mm = current->mm, + .size = page_size, + .copy = false, + }; + vm_object_t *obj; + + /* Align addr by rounding inward to avoid excessive page release. */ + end = round_down(addr + size, page_size); + start = round_up(addr, page_size); + if (start >= end) + return ret; + + mmap_read_lock(current->mm); + do { + vma = find_vma(current->mm, start); + if (!vma || !vma_is_peer_shared(vma)) { + pr_err("gmem: not peer-shared vma, skip dontneed\n"); + continue; + } + obj = vma->vm_obj; + if (!obj) { + pr_err("gmem: peer-shared vma should have vm_object\n"); + mmap_read_unlock(current->mm); + return -EINVAL; + } + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } else if (gm_mapping_cpu(gm_mapping)) { + zap_page_range_single(vma, start, page_size, NULL); + } else { + gmf.va = start; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret) { + pr_err("gmem: peer_unmap failed. 
ret %d\n", ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + } + set_gm_mapping_nomap(gm_mapping); + mutex_unlock(&gm_mapping->lock); + } while (start += page_size, start != end); + + mmap_read_unlock(current->mm); + return ret; +} + +static bool check_hmadvise_behavior(int behavior) +{ + return behavior == MADV_DONTNEED; +} + +int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) +{ + int error = -EINVAL; + struct hnode *node; + + if (hnid == -1) { + if (check_hmadvise_behavior(behavior)) { + goto no_hnid; + } else { + pr_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); + return error; + } + } + + if (hnid < 0) + return error; + + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) + return error; + + node = get_hnode(hnid); + if (!node) { + pr_err("hmadvise: hnode id %d is invalid\n", hnid); + return error; + } + +no_hnid: + switch (behavior) { + case MADV_PREFETCH: + return hmadvise_do_prefetch(node->dev, start, len_in); + case MADV_DONTNEED: + return hmadvise_do_eagerfree(start, len_in); + default: + pr_err("hmadvise: unsupported behavior %d\n", behavior); + } + + return error; +} +EXPORT_SYMBOL_GPL(hmadvise_inner); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 624671aaa60d0e0fdd551d7d14a039ecc9e2af7b..a55c88ba305df4956b4324ec2be71cc892fc7bff 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,6 +37,9 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif #include #include @@ -656,6 +659,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; +#ifdef CONFIG_GMEM + gm_mapping_t *gm_mapping = NULL; + + if (vma_is_peer_shared(vma)) + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); +#endif VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); @@ -663,7 +672,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } folio_throttle_swaprate(folio, gfp); @@ -673,7 +683,16 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, goto release; } +#ifdef CONFIG_GMEM + /* + * gmem device overcommit needs to reload the swapped page, + * so skip it to avoid clearing device data. 
+ */ + if (!vma_is_peer_shared(vma) || !gm_mapping_cpu(gm_mapping)) + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); +#else clear_huge_page(page, vmf->address, HPAGE_PMD_NR); +#endif /* * The memory barrier inside __folio_mark_uptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -698,7 +717,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + goto gm_mapping_release; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -706,6 +725,14 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_add_new_anon_rmap(folio, vma, haddr); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && gm_mapping_device(gm_mapping)) { + vmf->page = page; + ret = gm_host_fault_locked(vmf, PE_SIZE_PMD); + if (ret) + goto unlock_release; + } +#endif set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -713,6 +740,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + set_gm_mapping_host(gm_mapping, page); + mutex_unlock(&gm_mapping->lock); + } +#endif } return 0; @@ -722,6 +755,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, if (pgtable) pte_free(vma->vm_mm, pgtable); folio_put(folio); +gm_mapping_release: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); +#endif return ret; } @@ -780,17 +818,41 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; - struct folio *folio; + struct folio *folio = NULL; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret = 0; +#ifdef CONFIG_GMEM + gm_mapping_t *gm_mapping; + + if (vma_is_peer_shared(vma)) { + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + if (!gm_mapping) { + vm_object_mapping_create(vma->vm_obj, haddr); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + } + xa_unlock(vma->vm_obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (unlikely(!pmd_none(*vmf->pmd))) { + mutex_unlock(&gm_mapping->lock); + goto gm_mapping_release; + } + } +#endif - if (!transhuge_vma_suitable(vma, haddr)) - return VM_FAULT_FALLBACK; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + if (!transhuge_vma_suitable(vma, haddr)) { + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; + } + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto gm_mapping_release; + } khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && + !vma_is_peer_shared(vma) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; struct page *zero_page; @@ -829,12 +891,32 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } gfp = vma_thp_gfp_mask(vma); + +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + if (vma_is_peer_shared(vma)) + gfp = GFP_TRANSHUGE; + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + } +#else folio = 
vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); +#endif + if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + +gm_mapping_release: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); +#endif + return ret; } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index f69fbc2511984e224ab31f38a6315404b5d902b1..ed759cf1250b3e3e9dc45b7dcbdec4310790121d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,9 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif #include @@ -1522,6 +1525,47 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, return addr; } +#ifdef CONFIG_GMEM +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + gm_mapping_t *gm_mapping = NULL; + struct page *page = NULL; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, addr); + + if (gm_mapping && gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (page && (page_ref_count(page) != 0)) { + put_page(page); + gm_mapping->page = NULL; + } + } + xa_unlock(vma->vm_obj->logical_page_table); +} + +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + zap_logic_pmd_range(vma, addr, next); + } while (addr = next, addr != end); +} +#else +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +#endif + static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, @@ -1558,8 +1602,12 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * because MADV_DONTNEED holds the mmap_lock in read * mode. 
*/ - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pmd_range(vma, addr, next); goto next; + } + next = zap_pte_range(tlb, vma, pmd, addr, next, details); next: cond_resched(); @@ -1587,8 +1635,11 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, goto next; /* fall through */ } - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); @@ -1608,8 +1659,11 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none_or_clear_bad(p4d)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); @@ -1629,8 +1683,11 @@ void unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 40985c9d92d05217f42715ce4209ceb01c6b6bd5..5ed13fe2bd75397745ff60fb28ebcbe604644a92 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1716,7 +1716,11 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) +#else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) +#endif return false; /* diff --git a/mm/mmap.c b/mm/mmap.c index d600404580b2820183994fc44650a3e8344d4985..061cc7381233214a85fb3f79aabed087b6f69b13 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -48,6 +48,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -647,6 +651,10 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. 
*/ if (src->anon_vma && !dst->anon_vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(dst)) + dup_vm_object(dst, src); +#endif dst->anon_vma = src->anon_vma; return anon_vma_clone(dst, src); } @@ -754,6 +762,41 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_GMEM +struct gmem_vma_list { + struct vm_area_struct *vma; + struct list_head list; +}; + +void gmem_reserve_vma(struct vm_area_struct *value, struct list_head *head) +{ + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + if (!node) { + pr_err("%s: fail to alloc memory\n", __func__); + return; + } + + node->vma = value; + list_add_tail(&node->list, head); +} + +void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + struct vm_area_struct *vma = node->vma; + + if (vma != NULL) + vm_area_free(vma); + + list_del(&node->list); + kfree(node); + } +} +#endif + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates @@ -1041,6 +1084,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_iter_store(vmi, vma); if (adj_start) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(adjust)) + vm_object_adjust(adjust, adjust->vm_start + adj_start, + adjust->vm_end); +#endif adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { @@ -1267,7 +1315,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr, /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + len = round_up(len, SZ_2M); + addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, + SZ_2M); + } else { + addr = get_unmapped_area(file, addr, len, pgoff, flags); + } +#else addr = get_unmapped_area(file, addr, len, pgoff, flags); +#endif if (IS_ERR_VALUE(addr)) return addr; @@ -1391,6 +1449,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) + vm_flags |= VM_PEER_SHARED; +#endif + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || @@ -1827,6 +1890,27 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +unsigned long +get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, unsigned long align) +{ + if (len > TASK_SIZE) + return -ENOMEM; + + addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + addr = round_up(addr, align); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + return addr; +} +EXPORT_SYMBOL(get_unmapped_area_aligned); + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
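For reference, a minimal userspace sketch of how the MAP_PEER_SHARED path above might be exercised. This is illustrative only and not part of the patch: it assumes MAP_PEER_SHARED is exported by the series' uapi headers (the value below is a placeholder), and it relies on do_mmap() rounding peer-shared lengths up to 2MB and returning a 2MB-aligned address via get_unmapped_area_aligned().

#include <stddef.h>
#include <sys/mman.h>

#ifndef MAP_PEER_SHARED
#define MAP_PEER_SHARED 0x8000000	/* placeholder for illustration; the real value comes from the patched uapi headers */
#endif

int map_peer_shared_example(void)
{
	const size_t len = 3UL << 20;	/* do_mmap() rounds this up to 4MB for peer-shared mappings */
	void *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED, -1, 0);
	if (p == MAP_FAILED)
		return -1;

	/*
	 * do_vmi_munmap() requires a PMD-aligned start for peer-shared VMAs
	 * and rounds the length up to 2MB, so unmap the full rounded region.
	 */
	return munmap(p, 4UL << 20);
}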
@@ -2268,6 +2352,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_mpol; +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + dup_vm_object(new, vma); +#endif + if (new->vm_file) get_file(new->vm_file); @@ -2279,6 +2368,18 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + if (new_below) { + vm_object_adjust(new, new->vm_start, addr); + vm_object_adjust(vma, addr, vma->vm_end); + } else { + vm_object_adjust(vma, vma->vm_start, addr); + vm_object_adjust(new, addr, new->vm_end); + } + } +#endif + if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; @@ -2318,6 +2419,72 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +#ifdef CONFIG_GMEM +static void munmap_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + unsigned long addr = start; + vm_object_t *obj = vma->vm_obj; + gm_ret_t ret; + gm_context_t *ctx, *tmp; + gm_mapping_t *gm_mapping; + + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + if (!obj) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + gmf.va = addr; + gmf.size = HPAGE_SIZE; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_err("%s: call dev peer_unmap error %d\n", __func__, ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); + + if (!mm->gm_as) + return; + + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + if (!ctx->dev->mmu->peer_va_free) + continue; + + gmf.va = start; + gmf.size = end - start; + gmf.dev = ctx->dev; + + ret = ctx->dev->mmu->peer_va_free(&gmf); + if (ret != GM_RET_SUCCESS) + pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", + start, end - start, ret); + } +} +#endif + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 
* @vmi: The vma iterator @@ -2401,6 +2568,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } next = vma_next(vmi); + +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + munmap_in_peer_devices(mm, vma, start, end); +#endif + if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2509,6 +2682,18 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; + if (gmem_is_enabled()) { + vma = find_vma(mm, start); + if (!vma) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + + len = round_up(len, SZ_2M); + } + } + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2541,6 +2726,57 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } +#ifdef CONFIG_GMEM +static int alloc_va_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, unsigned long len, + vm_flags_t vm_flags) +{ + gm_context_t *ctx, *tmp; + gm_prot_t prot = VM_NONE; + gm_ret_t ret; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .size = len, + .prot = prot, + }; + + pr_debug("gmem: start mmap, as %p\n", mm->gm_as); + if (!mm->gm_as) + return -ENODEV; + + prot |= vm_flags; + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + /* + * TODO: consider the concurrency problem of device + * attaching/detaching from the gm_as. + */ + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + gmf.dev = ctx->dev; + + pr_debug("gmem: call vma_alloc\n"); + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + return ret; + } + } + + return GM_RET_SUCCESS; +} +#endif + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) @@ -2555,6 +2791,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); + +retry: +#endif /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -2567,21 +2809,33 @@ unsigned long mmap_region(struct file *file, unsigned long addr, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) + (len >> PAGE_SHIFT) - nr_pages)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) + if (security_vm_enough_memory_mm(mm, charged)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } vm_flags |= VM_ACCOUNT; } @@ -2736,6 +2990,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr, file = vma->vm_file; ksm_add_vma(vma); expanded: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + gm_ret_t ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + + if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + addr = get_unmapped_area(file, addr, len, pgoff, 0); + gmem_reserve_vma(vma, &reserve_list); + goto retry; + } else if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + error = -ENOMEM; + goto free_vma; + } + gmem_release_vma(mm, &reserve_list); + } +#endif perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -2785,6 +3056,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif validate_mm(mm); return error; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47421bedc12b7a99db2ac595f3e33fc20999aecf..90762bee97306f48a3d5329b9b2292fb3a3c953c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -216,6 +216,9 @@ EXPORT_SYMBOL(latent_entropy); nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, [N_ONLINE] = { { [0] = 1UL } }, +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = NODE_MASK_NONE, +#endif #ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM diff --git a/mm/vm_object.c b/mm/vm_object.c new file mode 100644 index 0000000000000000000000000000000000000000..ac1a115e4ee13d1b5a84c8a705caea0d132c534c --- /dev/null +++ b/mm/vm_object.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Logical Mapping Management + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu, Chao Liu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Since VM_OBJECT maintains the logical page table under each VMA, and each VMA + * points to a VM_OBJECT.
Ultimately VM_OBJECTs must be maintained whenever the VMA + gets changed: merge, split, adjust + */ +static struct kmem_cache *vm_object_cachep; +static struct kmem_cache *gm_mapping_cachep; + +/* gm_mapping will not be released dynamically */ +gm_mapping_t *alloc_gm_mapping(void) +{ + gm_mapping_t *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + + if (!gm_mapping) + return NULL; + + set_gm_mapping_nomap(gm_mapping); + mutex_init(&gm_mapping->lock); + + return gm_mapping; +} +EXPORT_SYMBOL(alloc_gm_mapping); + +static inline void release_gm_mapping(gm_mapping_t *mapping) +{ + kmem_cache_free(gm_mapping_cachep, mapping); +} + +static inline gm_mapping_t *lookup_gm_mapping(vm_object_t *obj, unsigned long pindex) +{ + return xa_load(obj->logical_page_table, pindex); +} + +int __init vm_object_init(void) +{ + vm_object_cachep = KMEM_CACHE(vm_object, 0); + if (!vm_object_cachep) + goto out; + + gm_mapping_cachep = KMEM_CACHE(gm_mapping, 0); + if (!gm_mapping_cachep) + goto free_vm_object; + + return 0; +free_vm_object: + kmem_cache_destroy(vm_object_cachep); +out: + return -ENOMEM; +} + +/* + * Create a VM_OBJECT and attach it to a VMA + * This should be called when a VMA is created. + */ +vm_object_t *vm_object_create(struct vm_area_struct *vma) +{ + vm_object_t *obj = kmem_cache_alloc(vm_object_cachep, GFP_KERNEL); + + if (!obj) + return NULL; + + spin_lock_init(&obj->lock); + obj->vma = vma; + + /* + * The logical page table maps linear_page_index(obj->vma, va) + * to pointers of struct gm_mapping. + */ + obj->logical_page_table = kmalloc(sizeof(struct xarray), GFP_KERNEL); + if (!obj->logical_page_table) { + kmem_cache_free(vm_object_cachep, obj); + return NULL; + } + + xa_init(obj->logical_page_table); + atomic_set(&obj->nr_pages, 0); + atomic_set(&obj->ref_count, 1); + + return obj; +} + +/* This should be called when a VMA no longer refers to a VM_OBJECT */ +void vm_object_drop_locked(struct vm_area_struct *vma) +{ + vm_object_t *obj = vma->vm_obj; + + if (!obj) { + pr_err("vm_object: vm_obj of the vma is NULL\n"); + return; + } + + /* + * We must enter this with VMA write-locked, which is unfortunately a giant lock.
+ * Note that per-VMA locks are available since Linux 6.4: + * https://lwn.net/Articles/906852/ + * https://lwn.net/Articles/906833/ + */ + free_gm_mappings(vma); + mmap_assert_write_locked(vma->vm_mm); + vma->vm_obj = NULL; + + if (atomic_dec_and_test(&obj->ref_count)) { + xa_destroy(obj->logical_page_table); + kfree(obj->logical_page_table); + kmem_cache_free(vm_object_cachep, obj); + } +} + +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) +{ + unsigned long index; + gm_mapping_t *mapping; + unsigned long moved_pages = 0; + + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); + + xa_lock(dst->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, mapping, linear_page_index(src, src->vm_end)) { + index = xas.xa_index - src->vm_pgoff + dst->vm_pgoff + + ((src->vm_start - dst->vm_start) >> PAGE_SHIFT); + __xa_store(dst->vm_obj->logical_page_table, index, mapping, GFP_KERNEL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &dst->vm_obj->nr_pages); + xa_unlock(dst->vm_obj->logical_page_table); +} + +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + /* remove logical mapping in [vma->vm_start, start) and [end, vma->vm_end) */ + unsigned long removed_pages = 0; + gm_mapping_t *mapping; + + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xas_lock(&xas); + if (vma->vm_start < start) { + xas_for_each(&xas, mapping, linear_page_index(vma, start)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + + if (vma->vm_end > end) { + xas_set(&xas, linear_page_index(vma, end)); + + xas_for_each(&xas, mapping, linear_page_index(vma, vma->vm_end)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + atomic_sub(removed_pages, &vma->vm_obj->nr_pages); + xas_unlock(&xas); +} + +/* + * Given a VA, the page_index is computed by + * page_index = linear_page_index(struct vm_area_struct *vma, unsigned long address) + */ +struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) +{ + return lookup_gm_mapping(obj, linear_page_index(obj->vma, va)); +} +EXPORT_SYMBOL_GPL(vm_object_lookup); + +void vm_object_mapping_create(vm_object_t *obj, gm_va_t start) +{ + pgoff_t index = linear_page_index(obj->vma, start); + gm_mapping_t *gm_mapping; + + gm_mapping = alloc_gm_mapping(); + if (!gm_mapping) + return; + + __xa_store(obj->logical_page_table, index, gm_mapping, GFP_KERNEL); +} + +void free_gm_mappings(struct vm_area_struct *vma) +{ + gm_mapping_t *gm_mapping; + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end)) { + release_gm_mapping(gm_mapping); + xas_store(&xas, NULL); + } + xa_unlock(vma->vm_obj->logical_page_table); +}
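To make the intended use of the logical page table above concrete, here is a small hypothetical sketch (not part of the patch) of a lookup-or-create step for a peer-shared address. Only vm_object_lookup(), vm_object_mapping_create() and the xa_lock on obj->logical_page_table come from the code above; the wrapper name and its caller behaviour are invented for illustration.

/*
 * Hypothetical helper (illustration only): mirrors the locking pattern used
 * by zap_logic_pmd_range() and munmap_in_peer_devices() earlier in this patch.
 */
static gm_mapping_t *peer_shared_lookup_or_create(struct vm_area_struct *vma,
						  unsigned long va)
{
	vm_object_t *obj = vma->vm_obj;
	gm_mapping_t *gm_mapping;

	if (!obj)
		return NULL;

	xa_lock(obj->logical_page_table);
	gm_mapping = vm_object_lookup(obj, va);
	if (!gm_mapping) {
		/* vm_object_mapping_create() uses __xa_store(), so hold the xa_lock. */
		vm_object_mapping_create(obj, va);
		gm_mapping = vm_object_lookup(obj, va);
	}
	xa_unlock(obj->logical_page_table);

	/* Callers would take gm_mapping->lock before touching page or device state. */
	return gm_mapping;
}

As in munmap_in_peer_devices(), the xa_lock only protects the xarray itself; per-entry state is serialized by gm_mapping->lock once the entry has been found.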