diff --git a/drivers/base/node.c b/drivers/base/node.c index b46db17124f346aff070bf839b606046720b9854..4943a25cc272e8a47d102f468a0437c6bdf6f19f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -922,6 +922,9 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS), +#endif }; static struct attribute *node_state_attrs[] = { @@ -934,6 +937,9 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, +#ifdef CONFIG_GMEM + &node_state_attr[N_HETEROGENEOUS].attr.attr, +#endif NULL }; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 801d6c83f896163426f073400937b2de4e290215..f485063c5818aebb6c7d4272acd4ee97b9d3f536 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -421,4 +421,11 @@ config ADI and SSM (Silicon Secured Memory). Intended consumers of this driver include crash and makedumpfile. +config GMEM_DEV + tristate "driver for gmem" + depends on GMEM + default m + help + driver for gmem in order to pass ioctl commands. + endmenu diff --git a/drivers/char/Makefile b/drivers/char/Makefile index c5f532e412f1a4b93100ad51e5662563d7f3ab25..4d01ef364c8b3037244a3df8e87a9a5c590ba95a 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -44,3 +44,4 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_XILLYBUS_CLASS) += xillybus/ obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o obj-$(CONFIG_ADI) += adi.o +obj-$(CONFIG_GMEM_DEV) += gmem_dev.o diff --git a/drivers/char/gmem_dev.c b/drivers/char/gmem_dev.c new file mode 100644 index 0000000000000000000000000000000000000000..225ed506a9c095ac3daeaa5ce92e8ee304a61d13 --- /dev/null +++ b/drivers/char/gmem_dev.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static int gmem_get_hnid(unsigned long arg) +{ + void __user *buf = (void __user *)arg; + struct gmem_hnid_arg gmem_hnid; + gm_context_t *ctx, *tmp; + gm_dev_t *gm_dev = NULL; + gm_as_t *as = NULL; + int hnuma_id; + + if (!access_ok(buf, sizeof(struct gmem_hnid_arg))) { + pr_err("access_ok failed\n"); + return -EFAULT; + } + + if (copy_from_user(&gmem_hnid, buf, sizeof(struct gmem_hnid_arg))) { + pr_err("copy_from_user failed.\n"); + return -EFAULT; + } + + if (!current->mm) { + pr_err("current's mm is null.\n"); + return -EFAULT; + } + + as = current->mm->gm_as; + if (!as) { + pr_err("current isn't gmem task failed.\n"); + return -ENODEV; + } + + list_for_each_entry_safe(ctx, tmp, &as->gm_ctx_list, gm_as_link) { + gm_dev = ctx->dev; + if (gm_dev) + break; + } + + if (!gm_dev) { + pr_err("gmem_id_to_device failed.\n"); + return -ENODEV; + } + + hnuma_id = first_node(gm_dev->registered_hnodes); + if (copy_to_user(gmem_hnid.hnuma_id, &hnuma_id, sizeof(int))) { + pr_err("copy_to_user failed.\n"); + return -EFAULT; + } + + return 0; +} + +static int gmem_hmadvise(unsigned long arg) +{ + struct hmadvise_arg harg; + void __user *buf; + int ret; + + buf = (void __user *)arg; + if (!access_ok(buf, sizeof(struct hmadvise_arg))) { + pr_err("access_ok failed.\n"); + return -EFAULT; + } + + if (copy_from_user(&harg, buf, sizeof(struct hmadvise_arg))) { + pr_err("copy_from_user failed.\n"); + return -EFAULT; + } + + ret = 
hmadvise_inner(harg.hnid, harg.start, harg.len_in, harg.behavior); + return ret; +} + +static long gmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long ret = 0; + + if (_IOC_TYPE(cmd) != GMEM_MAGIC) { + pr_err("invalid cmd magic number '%#x', should '%#x'.\n", + _IOC_TYPE(cmd), GMEM_MAGIC); + return -EINVAL; + } + + switch (cmd) { + case GMEM_GET_HNUMA_ID: + ret = gmem_get_hnid(arg); + break; + case GMEM_MADVISE: + ret = gmem_hmadvise(arg); + break; + default: + pr_err("invalid cmd '%#x'.\n", cmd); + return -EINVAL; + } + + return ret; +} + +static const struct file_operations gmem_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = gmem_ioctl, + .compat_ioctl = gmem_ioctl, +}; + +static struct miscdevice gmem_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "gmem", + .fops = &gmem_fops, +}; + +builtin_misc_device(gmem_miscdev); diff --git a/include/linux/device.h b/include/linux/device.h index 472dd24d4823a6f09757ff505b6df7da1c60a5b7..6a4c901119c896ddbefbf909de25d183d594d658 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -32,6 +32,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + struct device; struct device_private; struct device_driver; @@ -655,6 +659,9 @@ struct device { #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif +#ifdef CONFIG_GMEM + gm_dev_t *gm_dev; +#endif }; /** diff --git a/include/linux/gmem.h b/include/linux/gmem.h new file mode 100644 index 0000000000000000000000000000000000000000..128d9c4d88fd713e52ea7de95f60dee8624151fe --- /dev/null +++ b/include/linux/gmem.h @@ -0,0 +1,389 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ +#ifndef _GMEM_H +#define _GMEM_H + +#include + +typedef unsigned long gm_region_placement_t; +typedef unsigned long gm_prot_t; +typedef enum gm_ret gm_ret_t; +typedef struct gm_region gm_region_t; +typedef struct gm_mapping_set gm_mapping_set_t; +typedef enum gm_mmu_mode gm_mmu_mode_t; +typedef struct gm_mmu gm_mmu_t; +typedef unsigned long gm_dev_cap_t; +typedef struct gm_context gm_context_t; +typedef struct gm_dev gm_dev_t; +typedef struct gm_mapping gm_mapping_t; + +struct hnode; + +/* + * enum gm_ret - The return value of GMEM KPI that can be used to tell + * the core VM or peripheral driver whether the GMEM KPI was + * executed successfully. + * + * @GM_RET_SUCCESS: The invoked GMEM KPI behaved as expected. + * @GM_RET_FAILURE_UNKNOWN: The GMEM KPI failed with unknown reason. + * Any external status related to this KPI invocation changes must be rolled back. + */ +enum gm_ret { + GM_RET_SUCCESS = 0, + GM_RET_NOMEM, + GM_RET_PAGE_EXIST, + GM_RET_DMA_ERROR, + GM_RET_MIGRATING, + GM_RET_FAILURE_UNKNOWN, + GM_RET_UNIMPLEMENTED, +}; + +/* + * Defines a contiguous range of virtual addresses inside a gm_as_t + * As an analogy, this is conceptually similar as virtual_address_struct + */ +struct gm_region { + gm_va_t start_va; + gm_va_t end_va; + struct rb_node node; + gm_as_t *as; /* The address space that it belongs to */ + + /* Do we need another list_node to maintain a tailQ of allocated VMAs inside a gm_as? */ + struct list_head mapping_set_link; + + void (*callback_op)(void *args); + void *cb_args; +}; + +/* This holds a list of regions that must not be concurrently manipulated. */ +struct gm_mapping_set { + unsigned int region_cnt; + struct list_head gm_region_list; +}; + +/** + * enum gm_mmu_mode - defines the method to share a physical page table. 
+ * + * @GM_MMU_MODE_SHARE: Literally share a physical page table with another + * attached device's MMU. Nothing is guaranteed about the allocated address. + * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds + * exclusive mapping entries, so that device memory accesses can trigger fault-driven + * migration for automatic data locality optimizations. + * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates physical + * mapping entries whenever a physical mapping is installed inside the address space, so + * that it may minimize the page faults to be triggered by this device. + */ +enum gm_mmu_mode { + GM_MMU_MODE_SHARE, + GM_MMU_MODE_COHERENT_EXCLUSIVE, + GM_MMU_MODE_REPLICATE, +}; + +/* + * This is the parameter list of peer_map/unmap mmu operations. + * if device should copy data to/from host, set copy and dma_addr + */ +struct gm_fault_t { + struct mm_struct *mm; + gm_dev_t *dev; + gm_va_t va; + gm_pa_t size; + gm_prot_t prot; + bool copy; + dma_addr_t dma_addr; + int behavior; +}; + +struct gm_memcpy_t { + struct mm_struct *mm; + gm_dev_t *dev; + gm_va_t src; + gm_va_t dest; + dma_addr_t dma_addr; + size_t size; +}; + +/** + * + * This struct defines a series of MMU functions registered by a peripheral + * device that is to be invoked by GMEM. + * + * pmap is an opaque pointer that identifies a physical page table of a device. + * A physical page table holds the physical mappings that can be interpreted by + * the hardware MMU. + */ +struct gm_mmu { + /* + * Each bit indicates a supported page size for page-based TLB. + * Currently we do not consider range TLBs. + */ + unsigned long pgsize_bitmap; + + /* + * cookie identifies the type of the MMU. If two gm_mmu shares the same cookie, + * then it means their page table formats are compatible. + * In that case, they can share the same void *pmap as the input arg. + */ + unsigned long cookie; + + /* Synchronize VMA in a peer OS to interact with the host OS */ + gm_ret_t (*peer_va_alloc_fixed)(struct gm_fault_t *gmf); + gm_ret_t (*peer_va_free)(struct gm_fault_t *gmf); + + /* Create physical mappings on peer host. + * If copy is set, copy data [dma_addr, dma_addr + size] to peer host + */ + gm_ret_t (*peer_map)(struct gm_fault_t *gmf); + /* + * Destroy physical mappings on peer host. + * If copy is set, copy data back to [dma_addr, dma_addr + size] + */ + gm_ret_t (*peer_unmap)(struct gm_fault_t *gmf); + + /* Create or destroy a device's physical page table. */ + gm_ret_t (*pmap_create)(gm_dev_t *dev, void **pmap); + gm_ret_t (*pmap_destroy)(void *pmap); + + /* Create or destroy a physical mapping of a created physical page table */ + gm_ret_t (*pmap_enter)(void *pmap, gm_va_t va, gm_va_t size, + gm_pa_t pa, gm_prot_t prot); + gm_ret_t (*pmap_release)(void *pmap, gm_va_t va, gm_va_t size); + + /* Change the protection of a virtual page */ + gm_ret_t (*pmap_protect)(void *pmap, gm_va_t va, gm_va_t size, gm_prot_t new_prot); + + /* Invalidation functions of the MMU TLB */ + gm_ret_t (*tlb_invl)(void *pmap, gm_va_t va, gm_va_t size); + gm_ret_t (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); +}; + +/** + * gm_dev_cap_t defines a composable flag to describe the capabilities of a device. + * + * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. 
+ * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS + */ +#define GM_DEV_CAP_REPLAYABLE 0x00000001 +#define GM_DEV_CAP_PEER 0x00000010 + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +struct gm_context { + gm_as_t *as; + gm_dev_t *dev; + void *pmap; + /* + * consider a better container to maintain multiple ctx inside a device or multiple ctx + * inside a va space. + * A device may simultaneously have multiple contexts for time-sliced ctx switching + */ + struct list_head gm_dev_link; + + /* A va space may have multiple gm_context */ + struct list_head gm_as_link; +}; +#define get_gm_context(head) (list_entry((head)->prev, gm_context_t, ctx_link)) + +struct gm_dev { + int id; + + /* identifies the device capability + * For example, whether the device supports page faults or whether it has its + * own OS that manages the VA and PA resources. + */ + gm_dev_cap_t capability; + gm_mmu_t *mmu; + void *dev_data; + /* + * TODO: Use a better container of gm_context_t to support time-sliced context switch. + * A collection of device contexts. If the device does not support time-sliced context + * switch, then the size of the collection should never be greater than one. + * We need to think about what operators should the container be optimized for. + * A list, a radix-tree or what? What would gm_dev_activate require? + * Are there any accelerators that are really going to support time-sliced context switch? + */ + gm_context_t *current_ctx; + + struct list_head gm_ctx_list; + + /* Add tracking of registered device local physical memory. */ + nodemask_t registered_hnodes; + struct device *dma_dev; + + gm_mapping_t *gm_mapping; +}; + +#define HOST_NODE_ID (-1) + +#define GM_PAGE_DIRTY 0x8 /* Whether the page is dirty */ +#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ +#define GM_PAGE_DEVICE 0x20 +#define GM_PAGE_NOMAP 0x40 +#define GM_PAGE_PINNED 0x80 +#define GM_PAGE_WILLNEED 0x100 + +#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) + +/* Records the status of a page-size physical page */ +struct gm_mapping { + /* + * The node index may have three definitions: + * 1. a common CPU node + * 2. a hetero-node, e.g. GPU (that not necessarily supports CC ld/st) + * 3. 
a network ip (another OS that may have multiple hNUMA nodes), dynamically attached by dsm_attach + * Among these definitions, #1 and #2 in combination defines an h-NUMA topology + */ + unsigned int node_id; + + unsigned int flag; + + union { + struct page *page; /* CPU node */ + gm_dev_t *dev; /* hetero-node */ + gm_pa_t pfn; + }; + + struct mutex lock; +}; + +static inline bool gm_mapping_cpu(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_CPU); +} + +static inline void set_gm_mapping_host(gm_mapping_t *gm_mapping, struct page *page) +{ + gm_mapping->node_id = HOST_NODE_ID; + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_CPU; + gm_mapping->page = page; +} + +static inline bool gm_mapping_device(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_DEVICE); +} + +static inline void set_gm_mapping_device(gm_mapping_t *gm_mapping, gm_dev_t *dev) +{ + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_DEVICE; + gm_mapping->dev = dev; +} + +static inline bool gm_mapping_nomap(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_NOMAP); +} + +static inline void set_gm_mapping_nomap(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + gm_mapping->flag |= GM_PAGE_NOMAP; + gm_mapping->page = NULL; +} + +static inline void set_gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag |= GM_PAGE_WILLNEED; +} + +static inline void clear_gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_WILLNEED; +} + +static inline bool gm_mapping_willneed(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_WILLNEED); +} + +static inline void set_gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag |= GM_PAGE_PINNED; +} + +static inline void clear_gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + gm_mapping->flag &= ~GM_PAGE_PINNED; +} + +static inline bool gm_mapping_pinned(gm_mapping_t *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_PINNED); +} + +#define test_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define set_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } + +/* GMEM Device KPI */ +extern gm_ret_t gm_dev_create(gm_mmu_t *mmu, void *dev_data, gm_dev_cap_t cap, gm_dev_t **new_dev); +extern gm_ret_t gm_dev_destroy(gm_dev_t *dev); +extern gm_ret_t gm_dev_switch(gm_dev_t *dev, gm_as_t *as); +extern gm_ret_t gm_dev_detach(gm_dev_t *dev, gm_as_t *as); +extern gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end); +gm_ret_t gm_dev_fault(struct mm_struct *mm, gm_va_t addr, gm_dev_t *dev, int behavior); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size); + +/* GMEM address space KPI */ +extern gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end); +extern void gm_dev_unregister_physmem(gm_dev_t *dev, unsigned int nid); +extern gm_mapping_t *gm_mappings_alloc(unsigned int nid, unsigned int order); +extern void gm_mappings_free(gm_mapping_t *mapping, unsigned int order); +extern gm_ret_t gm_as_create(gm_va_t begin, gm_va_t end, gm_as_alloc_t policy, gm_va_t cache_quantum, gm_as_t **new_as); +extern gm_ret_t gm_as_destroy(gm_as_t *as); +extern gm_ret_t gm_as_attach(gm_as_t *as, gm_dev_t *dev, gm_mmu_mode_t mode, bool activate, gm_context_t **out_ctx); +extern gm_va_t gm_as_alloc(gm_as_t *as, gm_va_t hint, gm_va_t size, gm_va_t align, gm_va_t no_cross, + gm_va_t max_va, 
gm_region_t **new_region); + +extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); + +enum gmem_stat_item { + NR_PAGE_MIGRATING, + NR_GMEM_STAT_ITEMS +}; + +extern void gmem_state_counter(enum gmem_stat_item item, int val); +extern void gmem_state_counter_show(void); + +/* h-NUMA topology */ +struct hnode { + unsigned int id; + + gm_dev_t *dev; + + struct xarray pages; +}; + +extern struct hnode *hnodes[]; + +static inline bool is_hnode(int node) +{ + return !node_isset(node, node_possible_map) + && node_isset(node, hnode_map); +} + +static inline bool is_hnode_allowed(int node) +{ + return is_hnode(node) && node_isset(node, current->mems_allowed); +} + +static inline struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +void __init hnuma_init(void); +unsigned int alloc_hnode_id(void); +void free_hnode_id(unsigned int nid); +void hnode_init(struct hnode *hnode, unsigned int hnid, gm_dev_t *dev); +void hnode_deinit(unsigned int hnid, gm_dev_t *dev); + +#endif /* _GMEM_H */ diff --git a/include/linux/gmem_as.h b/include/linux/gmem_as.h new file mode 100644 index 0000000000000000000000000000000000000000..f62dac65e2edca1e37c96e407b60e876f2c9d8b8 --- /dev/null +++ b/include/linux/gmem_as.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _GMEM_AS_H +#define _GMEM_AS_H + +typedef struct gm_as gm_as_t; +typedef unsigned long gm_va_t; +typedef unsigned long gm_pa_t; +typedef enum gm_as_alloc gm_as_alloc_t; + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. 
*/ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of gm_as_t */ + struct rb_root rbroot; /*root of gm_region_t */ + gm_as_alloc_t policy; + gm_va_t start_va; + gm_va_t end_va; + gm_va_t cache_quantum; /* defines the VA unit size if an object cache is applied */ + + struct list_head gm_ctx_list; /* tracks device contexts attached to this va space, using gm_as_link */ +}; + +#endif diff --git a/include/linux/gmem_dev.h b/include/linux/gmem_dev.h new file mode 100644 index 0000000000000000000000000000000000000000..b1359f19b85070ab3005a8740b736ebf3207ee09 --- /dev/null +++ b/include/linux/gmem_dev.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __GMEM_DEV_H +#define __GMEM_DEV_H + +#include + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 27ce77080c79c7a026e641e491246fcf6f7e26c0..50f04282efcb1ecb76cf76e43031ee33310dff67 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -324,6 +324,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#ifdef CONFIG_GMEM +#define VM_PEER_SHARED BIT(56) +#else +#define VM_PEER_SHARED VM_NONE +#endif #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -3127,6 +3132,9 @@ unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long get_unmapped_area_aligned(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags, unsigned long align); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, @@ -3816,4 +3824,27 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif +#ifdef CONFIG_GMEM +DECLARE_STATIC_KEY_FALSE(gmem_status); + +static inline bool gmem_is_enabled(void) +{ + return static_branch_likely(&gmem_status); +} + +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + if (!gmem_is_enabled()) + return false; + + return !!(vma->vm_flags & VM_PEER_SHARED); +} +#else +static inline bool gmem_is_enabled(void) { return false; } +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 306a3d1a0fa655af73e3b3f37446dea60e4fe902..cac73ccf7367a3e7fd876b3fc0d640ec981a1ad7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #ifndef AT_VECTOR_SIZE_ARCH @@ -465,6 +469,44 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +#ifdef CONFIG_GMEM +/* + * Defines a centralized logical mapping table that reflects the mapping information + * regardless of the underlying arch-specific MMUs. + * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well + * as the filemap address_space struct from Linux page cache. + * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that + * the coordiantion between page tables must happen with CPU page table involved. 
That + * is to say, a generalized process unit must involve in a UVA-programming model, otherwise + * there is no point to support UVA programming. + * However, a VMA only needs to maintain logical mappings if the process has been + * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless + * we later build a reservation system on top of the logical mapping tables to support + * reservation-based superpages and rangeTLBs). + * A GM_REGION does not need to maintain logical mappings. In the case that a device wants + * to support its private address space with local physical memory, GMEM should forward address + * space management to the core VM, using VMAs, instead of using GM_REGIONs. + */ +struct vm_object { + spinlock_t lock; + struct vm_area_struct *vma; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; + + /* + * a vm object might be referred by multiple VMAs to share + * memory. + */ + atomic_t ref_count; +}; +typedef struct vm_object vm_object_t; +#endif + struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. */ @@ -571,6 +613,9 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_GMEM + struct vm_object *vm_obj; +#endif } __randomize_layout; #ifdef CONFIG_SCHED_MM_CID @@ -802,6 +847,9 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN */ +#ifdef CONFIG_GMEM + gm_as_t *gm_as; +#endif } __randomize_layout; /* diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index bb0ee80526b2dc30023cdda7dc6a8022f81727a4..39820123d7934456c2ad0fee44d13a607b6dd23b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -407,6 +407,9 @@ enum node_states { N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ +#ifdef CONFIG_GMEM + N_HETEROGENEOUS, /* The node has heterogeneous memory */ +#endif NR_NODE_STATES }; @@ -536,6 +539,13 @@ static inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +#ifdef CONFIG_GMEM +/* For h-NUMA topology */ +#define hnode_map node_states[N_HETEROGENEOUS] +#define num_hnodes() num_node_state(N_HETEROGENEOUS) +#define for_each_hnode(node) for_each_node_state(node, N_HETEROGENEOUS) +#endif + /* * For nodemask scratch area. 
* NODEMASK_ALLOC(type, name) allocates an object with a specified type and diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h new file mode 100644 index 0000000000000000000000000000000000000000..10bb7317803c0a973f50c9f0bfb014f045516180 --- /dev/null +++ b/include/linux/vm_object.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VM_OBJECT_H +#define _VM_OBJECT_H + +#include +#include + +#ifdef CONFIG_GMEM +/* vm_object KPI */ +int __init vm_object_init(void); +vm_object_t *vm_object_create(struct vm_area_struct *vma); +void vm_object_drop_locked(struct vm_area_struct *vma); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end); + +gm_mapping_t *alloc_gm_mapping(void); +struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va); +void vm_object_mapping_create(vm_object_t *obj, gm_va_t start); +void free_gm_mappings(struct vm_area_struct *vma); +#else +static inline void __init vm_object_init(void) {} +static inline vm_object_t *vm_object_create(struct vm_area_struct *vma) { return NULL; } +static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end) {} + +static inline gm_mapping_t *alloc_gm_mapping(void) { return NULL; } +static inline struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) { return NULL; } +static inline void vm_object_mapping_create(vm_object_t *obj, gm_va_t start) {} +static inline void free_gm_mappings(struct vm_area_struct *vma) {} +#endif + +#endif /* _VM_OBJECT_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432c64599f706b86e74a12581c2a54e..9f6ee16d18847cb34173cc3561e0be0d67ffc7ce 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,6 +33,8 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ +#define MAP_PEER_SHARED 0x8000000 + /* * Flags for mlock */ @@ -79,6 +81,11 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* for hmadvise */ +#define MADV_GMEM_BASE 0x1000 +#define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ +#define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/linux/gmem_dev.h b/include/uapi/linux/gmem_dev.h new file mode 100644 index 0000000000000000000000000000000000000000..015792660a62059effd7f9dc438af16b1fd1c2e8 --- /dev/null +++ b/include/uapi/linux/gmem_dev.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_GMEM_DEV_H +#define _UAPI_LINUX_GMEM_DEV_H + +#include +#include + +#define GMEM_MAGIC 0x55 + +#define _GMEM_GET_HNUMA_ID 1 +#define _GMEM_MADVISE 2 + +struct gmem_hnid_arg { + int *hnuma_id; +}; + +struct hmadvise_arg { + int hnid; + unsigned long start; + __kernel_size_t len_in; + int behavior; +}; + +#define GMEM_GET_HNUMA_ID _IOW(GMEM_MAGIC, _GMEM_GET_HNUMA_ID, struct gmem_hnid_arg) +#define GMEM_MADVISE _IOW(GMEM_MAGIC, _GMEM_MADVISE, struct hmadvise_arg) + +#endif diff --git a/init/main.c b/init/main.c index 57e4a74652a775ab124cd6a9a170600e1b893cf3..0b1add6bf4d061f987e4e2169b7bb6b5e3f00fb8 100644 --- a/init/main.c +++ b/init/main.c @@ -102,6 +102,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include 
@@ -908,6 +912,10 @@ asmlinkage __visible void __init __no_sanitize_address __noreturn start_kernel(v smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ boot_cpu_hotplug_init(); +#ifdef CONFIG_GMEM + hnuma_init(); +#endif + pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ jump_label_init(); diff --git a/kernel/fork.c b/kernel/fork.c index a721784458d9c38a3eca79f6c33b087f6e4f12a5..7e11bdaef25777e8abf98994894d8b6d409bf47f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -521,6 +525,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(orig)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + new->vm_obj = vm_object_create(new); + } +#endif + return new; } diff --git a/kernel/pid.c b/kernel/pid.c index f93954a0384d3889fb6d52de977562d10be1a012..69089222d8d8d7d85e766f0ff906caf77d558b6d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -434,6 +434,7 @@ struct task_struct *find_get_task_by_vpid(pid_t nr) return task; } +EXPORT_SYMBOL_GPL(find_get_task_by_vpid); struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { diff --git a/mm/Kconfig b/mm/Kconfig index 7672a22647b4a2434c22bda7b92b3897efc84783..b950407dd87fd9d814b891bad3138967c4c33cb7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1206,6 +1206,15 @@ config PER_VMA_LOCK This feature allows locking each virtual memory area separately when handling page faults instead of taking mmap_lock. +config GMEM + bool "gmem subsystem for multi-MMU cooperative management" + depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE + select ARCH_USES_HIGH_VMA_FLAGS + select GMEM_DEV + default y + help + say Y here to enable gmem subsystem + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index e29afc890cde2583124dabb8332be8f0f367313c..0824907eab98e9bc9e8f856c8e071bdd8fcba3cb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -40,6 +40,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o +mmu-$(CONFIG_GMEM) += gmem.o vm_object.o ifdef CONFIG_CROSS_MEMORY_ATTACH diff --git a/mm/gmem.c b/mm/gmem.c new file mode 100644 index 0000000000000000000000000000000000000000..d490e84291aff2e1b3a450fd8ca392ad8f9b25c1 --- /dev/null +++ b/mm/gmem.c @@ -0,0 +1,763 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Weixi Zhu + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(gmem_status); +EXPORT_SYMBOL_GPL(gmem_status); + +static struct kmem_cache *gm_as_cache; +static struct kmem_cache *gm_dev_cache; +static struct kmem_cache *gm_ctx_cache; +static struct kmem_cache *gm_region_cache; +static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); + +static bool enable_gmem; + +static inline unsigned long pe_mask(enum page_entry_size pe_size) +{ + if (pe_size == PE_SIZE_PTE) + return PAGE_MASK; + if (pe_size == PE_SIZE_PMD) + return HPAGE_PMD_MASK; + if (pe_size == PE_SIZE_PUD) + return HPAGE_PUD_MASK; + return ~0; +} + +static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; + +void gmem_state_counter(enum gmem_stat_item item, int val) +{ + if (!gmem_is_enabled()) + return; + + if (WARN_ON_ONCE(unlikely(item >= NR_GMEM_STAT_ITEMS))) + return; + + percpu_counter_add(&g_gmem_stats[item], val); +} + +#ifdef CONFIG_PROC_FS +static int gmemstat_show(struct seq_file *m, void *arg) +{ + if (!gmem_is_enabled()) + return 0; + + seq_printf(m, "migrating : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING])); + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static struct workqueue_struct *prefetch_wq; + +#define GM_WORK_CONCURRENCY 4 + +static int __init gmem_init(void) +{ + int err = -ENOMEM; + + if (!enable_gmem) + return 0; + + gm_as_cache = KMEM_CACHE(gm_as, 0); + if (!gm_as_cache) + goto out; + + gm_dev_cache = KMEM_CACHE(gm_dev, 0); + if (!gm_dev_cache) + goto free_as; + + gm_ctx_cache = KMEM_CACHE(gm_context, 0); + if (!gm_ctx_cache) + goto free_dev; + + gm_region_cache = KMEM_CACHE(gm_region, 0); + if (!gm_region_cache) + goto free_ctx; + + err = vm_object_init(); + if (err) + goto free_ctx; + + prefetch_wq = alloc_workqueue("prefetch", + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); + if (!prefetch_wq) { + pr_info("fail to alloc workqueue prefetch_wq\n"); + err = -EFAULT; + goto free_ctx; + } + +#ifdef CONFIG_PROC_FS + proc_create_single("gmemstat", 0444, NULL, gmemstat_show); +#endif + + static_branch_enable(&gmem_status); + + return 0; + +free_ctx: + kmem_cache_destroy(gm_ctx_cache); +free_dev: + kmem_cache_destroy(gm_dev_cache); +free_as: + kmem_cache_destroy(gm_as_cache); +out: + return -ENOMEM; +} +subsys_initcall(gmem_init); + +static int __init setup_gmem(char *str) +{ + strtobool(str, &enable_gmem); + + return 1; +} +__setup("gmem=", setup_gmem); + +/* + * Create a GMEM device, register its MMU function and the page table. + * The returned device pointer will be passed by new_dev. + * A unique id will be assigned to the GMEM device, using Linux's xarray. 
+ */ +gm_ret_t gm_dev_create(gm_mmu_t *mmu, void *dev_data, gm_dev_cap_t cap, gm_dev_t **new_dev) +{ + gm_dev_t *dev; + + if (!gmem_is_enabled()) + return GM_RET_FAILURE_UNKNOWN; + + dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); + if (!dev) + return GM_RET_NOMEM; + + if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, GFP_KERNEL)) { + kmem_cache_free(gm_dev_cache, dev); + return GM_RET_NOMEM; + } + + dev->capability = cap; + dev->mmu = mmu; + dev->dev_data = dev_data; + dev->current_ctx = NULL; + INIT_LIST_HEAD(&dev->gm_ctx_list); + *new_dev = dev; + nodes_clear(dev->registered_hnodes); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_create); + +/* Handle the page fault triggered by a given device */ +gm_ret_t gm_dev_fault(struct mm_struct *mm, gm_va_t addr, gm_dev_t *dev, int behavior) +{ + gm_ret_t ret = GM_RET_SUCCESS; + gm_mmu_t *mmu = dev->mmu; + struct device *dma_dev = dev->dma_dev; + struct vm_area_struct *vma; + vm_object_t *obj; + gm_mapping_t *gm_mapping; + gm_va_t size = HPAGE_SIZE; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior + }; + struct page *page = NULL; + + mmap_read_lock(mm); + + vma = find_vma(mm, addr); + if (!vma) { + pr_info("gmem: %s no vma\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + obj = vma->vm_obj; + if (!obj) { + pr_info("gmem: %s no vm_obj\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + vm_object_mapping_create(obj, addr); + gm_mapping = vm_object_lookup(obj, addr); + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + goto peer_map; + } else if (gm_mapping_device(gm_mapping)) { + if (behavior == MADV_WILLNEED || behavior == MADV_PINNED) { + goto peer_map; + } else { + ret = 0; + goto unlock; + } + } else if (gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (!page) { + pr_err("gmem: host gm_mapping page is NULL. Set nomap\n"); + set_gm_mapping_nomap(gm_mapping); + goto unlock; + } + get_page(page); + zap_page_range_single(vma, addr, size, NULL); + gmf.dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) + pr_info("gmem: dma map failed\n"); + + gmf.copy = true; + } + +peer_map: + ret = mmu->peer_map(&gmf); + if (ret != GM_RET_SUCCESS) { + if (ret == GM_RET_MIGRATING) { + /* + * gmem page is migrating due to overcommit. 
+ * update page to willneed and this will stop page evicting + */ + set_gm_mapping_willneed(gm_mapping); + gmem_state_counter(NR_PAGE_MIGRATING, 1); + ret = GM_RET_SUCCESS; + } else { + pr_err("gmem: peer map failed\n"); + if (page) { + set_gm_mapping_nomap(gm_mapping); + put_page(page); + } + } + goto unlock; + } + + if (page) { + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + put_page(page); + } + + set_gm_mapping_device(gm_mapping, dev); +unlock: + mutex_unlock(&gm_mapping->lock); +mmap_unlock: + mmap_read_unlock(mm); + return ret; +} +EXPORT_SYMBOL_GPL(gm_dev_fault); + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size) +{ + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & pe_mask(pe_size); + vm_object_t *obj = vma->vm_obj; + gm_mapping_t *gm_mapping; + gm_va_t size = HPAGE_SIZE; + gm_dev_t *dev; + struct device *dma_dev; + struct gm_fault_t gmf = { + .mm = vma->vm_mm, + .va = addr, + .size = size, + .copy = true, + }; + + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + pr_err("gmem: host fault gm_mapping should not be NULL\n"); + return VM_FAULT_SIGBUS; + } + + dev = gm_mapping->dev; + gmf.dev = dev; + dma_dev = dev->dma_dev; + gmf.dma_addr = dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + pr_err("gmem: host fault dma mapping error\n"); + return VM_FAULT_SIGBUS; + } + if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { + pr_err("gmem: peer unmap failed\n"); + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return VM_FAULT_SIGBUS; + } + + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return ret; +} + +/* + * Register the local physical memory of a gmem device. + * This implies dynamically creating + * the struct page data structures. 
+ */ +gm_ret_t gm_dev_register_physmem(gm_dev_t *dev, gm_pa_t begin, gm_pa_t end) +{ + gm_mapping_t *mapping; + gm_pa_t addr = PAGE_ALIGN(begin); + unsigned int nid; + int i, page_num = (end - addr) >> PAGE_SHIFT; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + + if (!hnode) + goto err; + + nid = alloc_hnode_id(); + if (nid == MAX_NUMNODES) + goto free_hnode; + hnode_init(hnode, nid, dev); + + mapping = kvmalloc(sizeof(gm_mapping_t) * page_num, GFP_KERNEL); + if (!mapping) + goto deinit_hnode; + + for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { + mapping[i].node_id = hnode->id; + mapping[i].pfn = addr >> PAGE_SHIFT; + mapping[i].flag = 0; + } + + xa_lock(&hnode->pages); + for (i = 0; i < page_num; i++) { + if (xa_err(__xa_store(&hnode->pages, i, mapping + i, GFP_KERNEL))) { + /* Probably nomem */ + kvfree(mapping); + xa_unlock(&hnode->pages); + goto deinit_hnode; + } + __xa_set_mark(&hnode->pages, i, XA_MARK_0); + } + xa_unlock(&hnode->pages); + + return GM_RET_SUCCESS; + +deinit_hnode: + hnode_deinit(nid, dev); + free_hnode_id(nid); +free_hnode: + kfree(hnode); +err: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gm_dev_register_physmem); + +void gm_dev_unregister_physmem(gm_dev_t *dev, unsigned int nid) +{ + struct hnode *hnode = get_hnode(nid); + gm_mapping_t *mapping = xa_load(&hnode->pages, 0); + + kvfree(mapping); + hnode_deinit(nid, dev); + free_hnode_id(nid); + kfree(hnode); +} +EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); + +gm_mapping_t *gm_mappings_alloc(unsigned int nid, unsigned int order) +{ + gm_mapping_t *mapping; + struct hnode *node = get_hnode(nid); + XA_STATE(xas, &node->pages, 0); + + /* TODO: support order > 0 */ + if (order != 0) + return ERR_PTR(-EINVAL); + + xa_lock(&node->pages); + mapping = xas_find_marked(&xas, ULONG_MAX, XA_MARK_0); + if (!mapping) { + xa_unlock(&node->pages); + return ERR_PTR(-ENOMEM); + } + + xas_clear_mark(&xas, XA_MARK_0); + xa_unlock(&node->pages); + + return mapping; +} +EXPORT_SYMBOL_GPL(gm_mappings_alloc); + +void gm_mappings_free(gm_mapping_t *mapping, unsigned int order) +{ + gm_mapping_t *entry; + struct hnode *node = get_hnode(mapping->node_id); + XA_STATE(xas, &node->pages, 0); + + /* TODO: support order > 0 */ + if (order != 0) + return; + + xas_for_each(&xas, entry, ULONG_MAX) { + if (entry == mapping) { + xas_set_mark(&xas, XA_MARK_0); + break; + } + } +} +EXPORT_SYMBOL_GPL(gm_mappings_free); + +/* GMEM Virtual Address Space API */ +gm_ret_t gm_as_create(gm_va_t begin, gm_va_t end, gm_as_alloc_t policy, + gm_va_t cache_quantum, gm_as_t **new_as) +{ + gm_as_t *as; + + if (!new_as) + return -EINVAL; + + as = kmem_cache_alloc(gm_as_cache, GFP_ATOMIC); + if (!as) + return -ENOMEM; + + spin_lock_init(&as->rbtree_lock); + as->rbroot = RB_ROOT; + as->start_va = begin; + as->end_va = end; + as->policy = policy; + + INIT_LIST_HEAD(&as->gm_ctx_list); + + *new_as = as; + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_create); + +gm_ret_t gm_as_destroy(gm_as_t *as) +{ + gm_context_t *ctx, *tmp_ctx; + + list_for_each_entry_safe(ctx, tmp_ctx, &as->gm_ctx_list, gm_as_link) + kfree(ctx); + + kmem_cache_free(gm_as_cache, as); + + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_destroy); + +gm_ret_t gm_as_attach(gm_as_t *as, gm_dev_t *dev, gm_mmu_mode_t mode, + bool activate, gm_context_t **out_ctx) +{ + gm_context_t *ctx; + int nid; + int ret; + + ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); + if (!ctx) + return GM_RET_NOMEM; + + ctx->as = as; + ctx->dev = dev; + ctx->pmap = NULL; + ret = dev->mmu->pmap_create(dev, 
&ctx->pmap); + if (ret) { + kmem_cache_free(gm_ctx_cache, ctx); + return ret; + } + + INIT_LIST_HEAD(&ctx->gm_dev_link); + INIT_LIST_HEAD(&ctx->gm_as_link); + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); + list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); + + if (activate) { + /* + * Here we should really have a callback function to perform the context switch + * for the hardware. E.g. in x86 this function is effectively flushing the CR3 value. + * Currently we do not care time-sliced context switch, unless someone wants to support it. + */ + dev->current_ctx = ctx; + } + *out_ctx = ctx; + + /* + * gm_as_attach will be used to attach device to process address space. + * Handle this case and add hnodes registered by device to process mems_allowed. + */ + for_each_node_mask(nid, dev->registered_hnodes) + node_set(nid, current->mems_allowed); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_attach); + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +void __init hnuma_init(void) +{ + unsigned int node; + + for_each_node(node) + node_set(node, hnode_map); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + spin_unlock(&hnode_lock); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, gm_dev_t *dev) +{ + hnodes[hnid] = hnode; + hnodes[hnid]->id = hnid; + hnodes[hnid]->dev = dev; + node_set(hnid, dev->registered_hnodes); + xa_init(&hnodes[hnid]->pages); +} + +void hnode_deinit(unsigned int hnid, gm_dev_t *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + xa_destroy(&hnodes[hnid]->pages); + hnodes[hnid] = NULL; +} + +struct prefetch_data { + struct mm_struct *mm; + gm_dev_t *dev; + unsigned long addr; + size_t size; + struct work_struct work; + int *res; +}; + +static void prefetch_work_cb(struct work_struct *work) +{ + struct prefetch_data *d = + container_of(work, struct prefetch_data, work); + unsigned long addr = d->addr, end = d->addr + d->size; + int page_size = HPAGE_SIZE; + int ret; + + do { + /* MADV_WILLNEED: dev will soon access this addr. */ + ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); + if (ret == GM_RET_PAGE_EXIST) { + pr_info("%s: device has done page fault, ignore prefetch\n", __func__); + } else if (ret != GM_RET_SUCCESS) { + *d->res = -EFAULT; + pr_err("%s: call dev fault error %d\n", __func__, ret); + } + } while (addr += page_size, addr != end); + + kfree(d); +} + +static int hmadvise_do_prefetch(gm_dev_t *dev, unsigned long addr, size_t size) +{ + unsigned long start, end, per_size; + int page_size = HPAGE_SIZE; + struct prefetch_data *data; + struct vm_area_struct *vma; + int res = GM_RET_SUCCESS; + + /* Align addr by rounding outward to make page cover addr. 
*/ + end = round_up(addr + size, page_size); + start = round_down(addr, page_size); + size = end - start; + + mmap_read_lock(current->mm); + vma = find_vma(current->mm, start); + if (!vma || start < vma->vm_start || end > vma->vm_end) { + mmap_read_unlock(current->mm); + return GM_RET_FAILURE_UNKNOWN; + } + mmap_read_unlock(current->mm); + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (start < end) { + data = kzalloc(sizeof(struct prefetch_data), GFP_KERNEL); + if (!data) { + flush_workqueue(prefetch_wq); + return GM_RET_NOMEM; + } + + INIT_WORK(&data->work, prefetch_work_cb); + data->mm = current->mm; + data->dev = dev; + data->addr = start; + data->res = &res; + if (per_size == 0) + data->size = size; + else + /* Process (1.x * per_size) for the last time */ + data->size = (end - start < 2 * per_size) ? (end - start) : per_size; + queue_work(prefetch_wq, &data->work); + start += data->size; + } + + flush_workqueue(prefetch_wq); + return res; +} + +static int hmadvise_do_eagerfree(unsigned long addr, size_t size) +{ + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + unsigned long start, end; + gm_mapping_t *gm_mapping; + struct gm_fault_t gmf = { + .mm = current->mm, + .size = page_size, + .copy = false, + }; + vm_object_t *obj; + + /* Align addr by rounding inward to avoid excessive page release. */ + end = round_down(addr + size, page_size); + start = round_up(addr, page_size); + if (start >= end) + return ret; + + mmap_read_lock(current->mm); + do { + vma = find_vma(current->mm, start); + if (!vma || !vma_is_peer_shared(vma)) { + pr_err("gmem: not peer-shared vma, skip dontneed\n"); + continue; + } + obj = vma->vm_obj; + if (!obj) { + pr_err("gmem: peer-shared vma should have vm_object\n"); + mmap_read_unlock(current->mm); + return -EINVAL; + } + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } else if (gm_mapping_cpu(gm_mapping)) { + zap_page_range_single(vma, start, page_size, NULL); + } else { + gmf.va = start; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret) { + pr_err("gmem: peer_unmap failed. 
ret %d\n", ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + } + set_gm_mapping_nomap(gm_mapping); + mutex_unlock(&gm_mapping->lock); + } while (start += page_size, start != end); + + mmap_read_unlock(current->mm); + return ret; +} + +static bool check_hmadvise_behavior(int behavior) +{ + return behavior == MADV_DONTNEED; +} + +int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) +{ + int error = -EINVAL; + struct hnode *node; + + if (hnid == -1) { + if (check_hmadvise_behavior(behavior)) { + goto no_hnid; + } else { + pr_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); + return error; + } + } + + if (hnid < 0) + return error; + + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) + return error; + + node = get_hnode(hnid); + if (!node) { + pr_err("hmadvise: hnode id %d is invalid\n", hnid); + return error; + } + +no_hnid: + switch (behavior) { + case MADV_PREFETCH: + return hmadvise_do_prefetch(node->dev, start, len_in); + case MADV_DONTNEED: + return hmadvise_do_eagerfree(start, len_in); + default: + pr_err("hmadvise: unsupported behavior %d\n", behavior); + } + + return error; +} +EXPORT_SYMBOL_GPL(hmadvise_inner); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 624671aaa60d0e0fdd551d7d14a039ecc9e2af7b..a55c88ba305df4956b4324ec2be71cc892fc7bff 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,6 +37,9 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif #include #include @@ -656,6 +659,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; +#ifdef CONFIG_GMEM + gm_mapping_t *gm_mapping = NULL; + + if (vma_is_peer_shared(vma)) + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); +#endif VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); @@ -663,7 +672,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } folio_throttle_swaprate(folio, gfp); @@ -673,7 +683,16 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, goto release; } +#ifdef CONFIG_GMEM + /* + * gmem device overcommit needs to reload the swapped page, + * so skip it to avoid clearing device data. 
+ */ + if (!vma_is_peer_shared(vma) || !gm_mapping_cpu(gm_mapping)) + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); +#else clear_huge_page(page, vmf->address, HPAGE_PMD_NR); +#endif /* * The memory barrier inside __folio_mark_uptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -698,7 +717,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + goto gm_mapping_release; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -706,6 +725,14 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_add_new_anon_rmap(folio, vma, haddr); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && gm_mapping_device(gm_mapping)) { + vmf->page = page; + ret = gm_host_fault_locked(vmf, PE_SIZE_PMD); + if (ret) + goto unlock_release; + } +#endif set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -713,6 +740,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + set_gm_mapping_host(gm_mapping, page); + mutex_unlock(&gm_mapping->lock); + } +#endif } return 0; @@ -722,6 +755,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, if (pgtable) pte_free(vma->vm_mm, pgtable); folio_put(folio); +gm_mapping_release: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); +#endif return ret; } @@ -780,17 +818,41 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; - struct folio *folio; + struct folio *folio = NULL; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret = 0; +#ifdef CONFIG_GMEM + gm_mapping_t *gm_mapping; + + if (vma_is_peer_shared(vma)) { + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + if (!gm_mapping) { + vm_object_mapping_create(vma->vm_obj, haddr); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + } + xa_unlock(vma->vm_obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (unlikely(!pmd_none(*vmf->pmd))) { + mutex_unlock(&gm_mapping->lock); + goto gm_mapping_release; + } + } +#endif - if (!transhuge_vma_suitable(vma, haddr)) - return VM_FAULT_FALLBACK; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + if (!transhuge_vma_suitable(vma, haddr)) { + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; + } + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto gm_mapping_release; + } khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && + !vma_is_peer_shared(vma) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; struct page *zero_page; @@ -829,12 +891,32 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } gfp = vma_thp_gfp_mask(vma); + +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + if (vma_is_peer_shared(vma)) + gfp = GFP_TRANSHUGE; + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + } +#else folio = 
vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); +#endif + if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + +gm_mapping_release: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); +#endif + return ret; } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index f69fbc2511984e224ab31f38a6315404b5d902b1..ed759cf1250b3e3e9dc45b7dcbdec4310790121d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,9 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif #include @@ -1522,6 +1525,47 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, return addr; } +#ifdef CONFIG_GMEM +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + gm_mapping_t *gm_mapping = NULL; + struct page *page = NULL; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, addr); + + if (gm_mapping && gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (page && (page_ref_count(page) != 0)) { + put_page(page); + gm_mapping->page = NULL; + } + } + xa_unlock(vma->vm_obj->logical_page_table); +} + +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + zap_logic_pmd_range(vma, addr, next); + } while (addr = next, addr != end); +} +#else +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +#endif + static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, @@ -1558,8 +1602,12 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * because MADV_DONTNEED holds the mmap_lock in read * mode. 
*/ - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pmd_range(vma, addr, next); goto next; + } + next = zap_pte_range(tlb, vma, pmd, addr, next, details); next: cond_resched(); @@ -1587,8 +1635,11 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, goto next; /* fall through */ } - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); @@ -1608,8 +1659,11 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none_or_clear_bad(p4d)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); @@ -1629,8 +1683,11 @@ void unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 40985c9d92d05217f42715ce4209ceb01c6b6bd5..5ed13fe2bd75397745ff60fb28ebcbe604644a92 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1716,7 +1716,11 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) +#else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) +#endif return false; /* diff --git a/mm/mmap.c b/mm/mmap.c index d600404580b2820183994fc44650a3e8344d4985..061cc7381233214a85fb3f79aabed087b6f69b13 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -48,6 +48,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -647,6 +651,10 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. 
*/ if (src->anon_vma && !dst->anon_vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(dst)) + dup_vm_object(dst, src); +#endif dst->anon_vma = src->anon_vma; return anon_vma_clone(dst, src); } @@ -754,6 +762,41 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_GMEM +struct gmem_vma_list { + struct vm_area_struct *vma; + struct list_head list; +}; + +void gmem_reserve_vma(struct vm_area_struct *value, struct list_head *head) +{ + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + if (!node) { + pr_err("%s: fail to alloc memory\n", __func__); + return; + } + + node->vma = value; + list_add_tail(&node->list, head); +} + +void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + struct vm_area_struct *vma = node->vma; + + if (vma != NULL) + vm_area_free(vma); + + list_del(&node->list); + kfree(node); + } +} +#endif + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates @@ -1041,6 +1084,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_iter_store(vmi, vma); if (adj_start) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(adjust)) + vm_object_adjust(adjust, adjust->vm_start + adj_start, + adjust->vm_end); +#endif adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { @@ -1267,7 +1315,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr, /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + len = round_up(len, SZ_2M); + addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, + SZ_2M); + } else { + addr = get_unmapped_area(file, addr, len, pgoff, flags); + } +#else addr = get_unmapped_area(file, addr, len, pgoff, flags); +#endif if (IS_ERR_VALUE(addr)) return addr; @@ -1391,6 +1449,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) + vm_flags |= VM_PEER_SHARED; +#endif + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || @@ -1827,6 +1890,27 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +unsigned long +get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, unsigned long align) +{ + if (len > TASK_SIZE) + return -ENOMEM; + + addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + addr = round_up(addr, align); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + return addr; +} +EXPORT_SYMBOL(get_unmapped_area_aligned); + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
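For reference, a minimal userspace sketch of how the MAP_PEER_SHARED path above might be exercised. This is illustrative only and not part of the patch: it assumes MAP_PEER_SHARED is exported by the series' uapi headers (the value below is a placeholder), and it relies on do_mmap() rounding peer-shared lengths up to 2MB and returning a 2MB-aligned address via get_unmapped_area_aligned().

#include <stddef.h>
#include <sys/mman.h>

#ifndef MAP_PEER_SHARED
#define MAP_PEER_SHARED 0x8000000	/* placeholder for illustration; the real value comes from the patched uapi headers */
#endif

int map_peer_shared_example(void)
{
	const size_t len = 3UL << 20;	/* do_mmap() rounds this up to 4MB for peer-shared mappings */
	void *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED, -1, 0);
	if (p == MAP_FAILED)
		return -1;

	/*
	 * do_vmi_munmap() requires a PMD-aligned start for peer-shared VMAs
	 * and rounds the length up to 2MB, so unmap the full rounded region.
	 */
	return munmap(p, 4UL << 20);
}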
@@ -2268,6 +2352,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_mpol; +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + dup_vm_object(new, vma); +#endif + if (new->vm_file) get_file(new->vm_file); @@ -2279,6 +2368,18 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + if (new_below) { + vm_object_adjust(new, new->vm_start, addr); + vm_object_adjust(vma, addr, vma->vm_end); + } else { + vm_object_adjust(vma, vma->vm_start, addr); + vm_object_adjust(new, addr, new->vm_end); + } + } +#endif + if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; @@ -2318,6 +2419,72 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +#ifdef CONFIG_GMEM +static void munmap_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + unsigned long addr = start; + vm_object_t *obj = vma->vm_obj; + gm_ret_t ret; + gm_context_t *ctx, *tmp; + gm_mapping_t *gm_mapping; + + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + if (!obj) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + gmf.va = addr; + gmf.size = HPAGE_SIZE; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_err("%s: call dev peer_unmap error %d\n", __func__, ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); + + if (!mm->gm_as) + return; + + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + if (!ctx->dev->mmu->peer_va_free) + continue; + + gmf.va = start; + gmf.size = end - start; + gmf.dev = ctx->dev; + + ret = ctx->dev->mmu->peer_va_free(&gmf); + if (ret != GM_RET_SUCCESS) + pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", + start, end - start, ret); + } +} +#endif + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 
* @vmi: The vma iterator @@ -2401,6 +2568,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } next = vma_next(vmi); + +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + munmap_in_peer_devices(mm, vma, start, end); +#endif + if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2509,6 +2682,18 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; + if (gmem_is_enabled()) { + vma = find_vma(mm, start); + if (!vma) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + + len = round_up(len, SZ_2M); + } + } + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2541,6 +2726,57 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } +#ifdef CONFIG_GMEM +static int alloc_va_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, unsigned long len, + vm_flags_t vm_flags) +{ + gm_context_t *ctx, *tmp; + gm_prot_t prot = VM_NONE; + gm_ret_t ret; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .size = len, + .prot = prot, + }; + + pr_debug("gmem: start mmap, as %p\n", mm->gm_as); + if (!mm->gm_as) + return -ENODEV; + + prot |= vm_flags; + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + /* + * TODO: consider the concurrency problem of device + * attaching/detaching from the gm_as. + */ + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + gmf.dev = ctx->dev; + + pr_debug("gmem: call vma_alloc\n"); + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + return ret; + } + } + + return GM_RET_SUCCESS; +} +#endif + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) @@ -2555,6 +2791,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); + +retry: +#endif /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -2567,21 +2809,33 @@ unsigned long mmap_region(struct file *file, unsigned long addr, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) + (len >> PAGE_SHIFT) - nr_pages)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) + if (security_vm_enough_memory_mm(mm, charged)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } vm_flags |= VM_ACCOUNT; } @@ -2736,6 +2990,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr, file = vma->vm_file; ksm_add_vma(vma); expanded: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + gm_ret_t ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + + if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + addr = get_unmapped_area(file, addr, len, pgoff, 0); + gmem_reserve_vma(vma, &reserve_list); + goto retry; + } else if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + error = -ENOMEM; + goto free_vma; + } + gmem_release_vma(mm, &reserve_list); + } +#endif perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -2785,6 +3056,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif validate_mm(mm); return error; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47421bedc12b7a99db2ac595f3e33fc20999aecf..90762bee97306f48a3d5329b9b2292fb3a3c953c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -216,6 +216,9 @@ EXPORT_SYMBOL(latent_entropy); nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, [N_ONLINE] = { { [0] = 1UL } }, +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = NODE_MASK_NONE, +#endif #ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM diff --git a/mm/vm_object.c b/mm/vm_object.c new file mode 100644 index 0000000000000000000000000000000000000000..ac1a115e4ee13d1b5a84c8a705caea0d132c534c --- /dev/null +++ b/mm/vm_object.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Logical Mapping Management + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu, Chao Liu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Since VM_OBJECT maintains the logical page table under each VMA, and each VMA + * points to a VM_OBJECT.
Ultimately VM_OBJECTs must be maintained whenever the VMA + gets changed: merge, split, adjust + */ +static struct kmem_cache *vm_object_cachep; +static struct kmem_cache *gm_mapping_cachep; + +/* gm_mapping will not be released dynamically */ +gm_mapping_t *alloc_gm_mapping(void) +{ + gm_mapping_t *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + + if (!gm_mapping) + return NULL; + + set_gm_mapping_nomap(gm_mapping); + mutex_init(&gm_mapping->lock); + + return gm_mapping; +} +EXPORT_SYMBOL(alloc_gm_mapping); + +static inline void release_gm_mapping(gm_mapping_t *mapping) +{ + kmem_cache_free(gm_mapping_cachep, mapping); +} + +static inline gm_mapping_t *lookup_gm_mapping(vm_object_t *obj, unsigned long pindex) +{ + return xa_load(obj->logical_page_table, pindex); +} + +int __init vm_object_init(void) +{ + vm_object_cachep = KMEM_CACHE(vm_object, 0); + if (!vm_object_cachep) + goto out; + + gm_mapping_cachep = KMEM_CACHE(gm_mapping, 0); + if (!gm_mapping_cachep) + goto free_vm_object; + + return 0; +free_vm_object: + kmem_cache_destroy(vm_object_cachep); +out: + return -ENOMEM; +} + +/* + * Create a VM_OBJECT and attach it to a VMA + * This should be called when a VMA is created. + */ +vm_object_t *vm_object_create(struct vm_area_struct *vma) +{ + vm_object_t *obj = kmem_cache_alloc(vm_object_cachep, GFP_KERNEL); + + if (!obj) + return NULL; + + spin_lock_init(&obj->lock); + obj->vma = vma; + + /* + * The logical page table maps linear_page_index(obj->vma, va) + * to pointers of struct gm_mapping. + */ + obj->logical_page_table = kmalloc(sizeof(struct xarray), GFP_KERNEL); + if (!obj->logical_page_table) { + kmem_cache_free(vm_object_cachep, obj); + return NULL; + } + + xa_init(obj->logical_page_table); + atomic_set(&obj->nr_pages, 0); + atomic_set(&obj->ref_count, 1); + + return obj; +} + +/* This should be called when a VMA no longer refers to a VM_OBJECT */ +void vm_object_drop_locked(struct vm_area_struct *vma) +{ + vm_object_t *obj = vma->vm_obj; + + if (!obj) { + pr_err("vm_object: vm_obj of the vma is NULL\n"); + return; + } + + /* + * We must enter this with VMA write-locked, which is unfortunately a giant lock.
+ * Note that per-VMA locks are available since Linux 6.4: + * https://lwn.net/Articles/906852/ + * https://lwn.net/Articles/906833/ + */ + free_gm_mappings(vma); + mmap_assert_write_locked(vma->vm_mm); + vma->vm_obj = NULL; + + if (atomic_dec_and_test(&obj->ref_count)) { + xa_destroy(obj->logical_page_table); + kfree(obj->logical_page_table); + kmem_cache_free(vm_object_cachep, obj); + } +} + +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) +{ + unsigned long index; + gm_mapping_t *mapping; + unsigned long moved_pages = 0; + + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); + + xa_lock(dst->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, mapping, linear_page_index(src, src->vm_end)) { + index = xas.xa_index - src->vm_pgoff + dst->vm_pgoff + + ((src->vm_start - dst->vm_start) >> PAGE_SHIFT); + __xa_store(dst->vm_obj->logical_page_table, index, mapping, GFP_KERNEL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &dst->vm_obj->nr_pages); + xa_unlock(dst->vm_obj->logical_page_table); +} + +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + /* remove logical mapping in [vma->vm_start, start) and [end, vma->vm_end) */ + unsigned long removed_pages = 0; + gm_mapping_t *mapping; + + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xas_lock(&xas); + if (vma->vm_start < start) { + xas_for_each(&xas, mapping, linear_page_index(vma, start)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + + if (vma->vm_end > end) { + xas_set(&xas, linear_page_index(vma, end)); + + xas_for_each(&xas, mapping, linear_page_index(vma, vma->vm_end)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + atomic_sub(removed_pages, &vma->vm_obj->nr_pages); + xas_unlock(&xas); +} + +/* + * Given a VA, the page_index is computed by + * page_index = linear_page_index(struct vm_area_struct *vma, unsigned long address) + */ +struct gm_mapping *vm_object_lookup(vm_object_t *obj, gm_va_t va) +{ + return lookup_gm_mapping(obj, linear_page_index(obj->vma, va)); +} +EXPORT_SYMBOL_GPL(vm_object_lookup); + +void vm_object_mapping_create(vm_object_t *obj, gm_va_t start) +{ + pgoff_t index = linear_page_index(obj->vma, start); + gm_mapping_t *gm_mapping; + + gm_mapping = alloc_gm_mapping(); + if (!gm_mapping) + return; + + __xa_store(obj->logical_page_table, index, gm_mapping, GFP_KERNEL); +} + +void free_gm_mappings(struct vm_area_struct *vma) +{ + gm_mapping_t *gm_mapping; + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end)) { + release_gm_mapping(gm_mapping); + xas_store(&xas, NULL); + } + xa_unlock(vma->vm_obj->logical_page_table); +}
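To make the intended use of the logical page table above concrete, here is a small hypothetical sketch (not part of the patch) of a lookup-or-create step for a peer-shared address. Only vm_object_lookup(), vm_object_mapping_create() and the xa_lock on obj->logical_page_table come from the code above; the wrapper name and its caller behaviour are invented for illustration.

/*
 * Hypothetical helper (illustration only): mirrors the locking pattern used
 * by zap_logic_pmd_range() and munmap_in_peer_devices() earlier in this patch.
 */
static gm_mapping_t *peer_shared_lookup_or_create(struct vm_area_struct *vma,
						  unsigned long va)
{
	vm_object_t *obj = vma->vm_obj;
	gm_mapping_t *gm_mapping;

	if (!obj)
		return NULL;

	xa_lock(obj->logical_page_table);
	gm_mapping = vm_object_lookup(obj, va);
	if (!gm_mapping) {
		/* vm_object_mapping_create() uses __xa_store(), so hold the xa_lock. */
		vm_object_mapping_create(obj, va);
		gm_mapping = vm_object_lookup(obj, va);
	}
	xa_unlock(obj->logical_page_table);

	/* Callers would take gm_mapping->lock before touching page or device state. */
	return gm_mapping;
}

As in munmap_in_peer_devices(), the xa_lock only protects the xarray itself; per-entry state is serialized by gm_mapping->lock once the entry has been found.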