diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 792688beb75700d2feada3a52f282c4712b61488..89ee0715a1de4e7e6cd79fdaa2f01015b0ff5e58 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2360,3 +2360,36 @@ source "drivers/acpi/Kconfig" source "arch/arm64/kvm/Kconfig" +menuconfig ASCEND_FEATURES + bool "Support Ascend Features" + depends on ARM64 + help + The Ascend chip uses the Hisilicon DaVinci architecture, and mainly + focuses on the AI and machine learning area; it contains many external features. + Enable this config to enable a selective list of these features. + If unsure, say N + +if ASCEND_FEATURES + +config EXTEND_HUGEPAGE_MAPPING + bool "Extend for hugepages mapping" + depends on ARM64 + default n + help + Introduce vmalloc/vmap/remap interfaces that handle only hugepages. + This allows the user to do huge vmalloc and remap those hugepage ranges + into userspace. + If unsure, say N + +config ASCEND_SHARE_POOL + bool "Enable support for the Share Pool Memory" + default n + depends on HAVE_ARCH_HUGE_VMALLOC && EXTEND_HUGEPAGE_MAPPING + select ARCH_USES_HIGH_VMA_FLAGS + help + This feature allows multiple processes to share virtual memory both + in kernel and user space; it is only enabled for the Ascend platform. + To enable this feature, the enable_ascend_share_pool bootarg is needed. + If unsure, say N + +endif # ASCEND_FEATURES diff --git a/fs/proc/base.c b/fs/proc/base.c index 420e1d572856e13222eb1dba2af912a28708d141..8c5e46dd9e86150a2d262504227787096a18eabf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -3425,6 +3426,9 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + ONE("sp_group", 0444, proc_sp_group_state), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3770,6 +3774,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations), #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + ONE("sp_group", 0444, proc_sp_group_state), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d041aa9f0feb32c61f507097708b13dce9db6ec..55d32527500de71da7f74f0fbb3a00db3fdd6cae 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -264,6 +264,26 @@ long hugetlb_change_protection(struct vm_area_struct *vma, bool is_hugetlb_entry_migration(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); +#ifdef CONFIG_ASCEND_FEATURES +int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, struct page *hpage); +int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, + unsigned long vir_addr, + pgprot_t prot, unsigned long phy_addr); +#else /* CONFIG_ASCEND_FEATURES */ +static inline int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, struct page *hpage) +{ + return -EPERM; +} +static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, + unsigned long vir_addr, + pgprot_t prot, unsigned long phy_addr) +{ + return -EPERM; +} +#endif /* CONFIG_ASCEND_FEATURES */ + #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) @@ -470,6 +490,18 @@ static
inline vm_fault_t hugetlb_fault(struct mm_struct *mm, static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } +static inline int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, struct page *hpage) +{ + return -EPERM; +} +static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, + unsigned long vir_addr, + pgprot_t prot, unsigned long phy_addr) +{ + return -EPERM; +} + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index d232de7cdc569f88adb5e2fe5d978fcaa68b2cc2..ac6902805a57b537bb360ea53b7b259f1d30cf4d 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -184,6 +184,9 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); +extern long __do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm); #else struct mempolicy {}; @@ -293,5 +296,13 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) return false; } +static inline long __do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm) +{ + return 0; +} + + #endif /* CONFIG_NUMA */ #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 15eb40f2cd97b27c28512aa9b0f82c34336dfbb8..406fd6d9f429a9ca7044a62eb05d17e24644da2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -374,6 +374,12 @@ extern unsigned int kobjsize(const void *objp); # define VM_MTE_ALLOWED VM_NONE #endif +#if defined(CONFIG_ASCEND_SHARE_POOL) +# define VM_SHARE_POOL VM_HIGH_ARCH_4 +#else +# define VM_SHARE_POOL VM_NONE +#endif + #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif @@ -3146,6 +3152,10 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); +extern unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf); extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade); @@ -3159,6 +3169,8 @@ extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct list_head *uf, bool downgrade); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); +extern int do_mm_populate(struct mm_struct *mm, unsigned long start, unsigned long len, + int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) { /* Ignore errors */ @@ -3166,6 +3178,11 @@ static inline void mm_populate(unsigned long addr, unsigned long len) } #else static inline void mm_populate(unsigned long addr, unsigned long len) {} +static inline int do_mm_populate(struct mm_struct *mm, unsigned long start, unsigned long len, + int ignore_errors) +{ + return -EPERM; +} #endif /* These take the mm semaphore themselves */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 
98a13fb411bfd8b366f2a3fab4d10f9986237ac8..b9d065c7c5827210cc966b070cae97871ba1a3e9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -616,6 +616,9 @@ struct vm_area_struct { #ifdef CONFIG_GMEM struct vm_object *vm_obj; #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + struct sp_area *spa; +#endif } __randomize_layout; #ifdef CONFIG_SCHED_MM_CID @@ -855,6 +858,10 @@ struct mm_struct { #endif } __randomize_layout; +#if IS_ENABLED(CONFIG_ASCEND_SHARE_POOL) + struct sp_group_master *sp_group_master; +#endif + /* * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..449b0ab6a526db781c078a318a6c62ee91cfda5d --- /dev/null +++ b/include/linux/share_pool.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_SHARE_POOL_H +#define LINUX_SHARE_POOL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SP_HUGEPAGE (1 << 0) +#define SP_HUGEPAGE_ONLY (1 << 1) +#define SP_DVPP (1 << 2) +#define SP_SPEC_NODE_ID (1 << 3) +#define SP_PROT_RO (1 << 16) +/* + * SP_PROT_FOCUS should used with SP_PROT_RO, + * to alloc a memory within sharepool ro memory. + */ +#define SP_PROT_FOCUS (1 << 17) + +#define DEVICE_ID_BITS 4UL +#define DEVICE_ID_MASK ((1UL << DEVICE_ID_BITS) - 1UL) +#define DEVICE_ID_SHIFT 32UL +#define NODE_ID_BITS NODES_SHIFT +#define NODE_ID_MASK ((1UL << NODE_ID_BITS) - 1UL) +#define NODE_ID_SHIFT (DEVICE_ID_SHIFT + DEVICE_ID_BITS) + +#define SP_FLAG_MASK (SP_HUGEPAGE | SP_HUGEPAGE_ONLY | SP_DVPP | \ + SP_SPEC_NODE_ID | SP_PROT_RO | SP_PROT_FOCUS | \ + (DEVICE_ID_MASK << DEVICE_ID_SHIFT) | \ + (NODE_ID_MASK << NODE_ID_SHIFT)) + +#define sp_flags_device_id(flags) (((flags) >> DEVICE_ID_SHIFT) & DEVICE_ID_MASK) +#define sp_flags_node_id(flags) (((flags) >> NODE_ID_SHIFT) & NODE_ID_MASK) + +#define SPG_ID_NONE (-1) /* not associated with sp_group, only for specified thread */ +#define SPG_ID_DEFAULT 0 /* use the spg id of current thread */ +#define SPG_ID_MIN 1 /* valid id should be >= 1 */ +#define SPG_ID_MAX 99999 +#define SPG_ID_AUTO_MIN 100000 +#define SPG_ID_AUTO_MAX 199999 +#define SPG_ID_AUTO 200000 /* generate group id automatically */ +#define SPG_ID_LOCAL_MIN 200001 +#define SPG_ID_LOCAL_MAX 299999 +#define SPG_ID_LOCAL 300000 /* generate group id in local range */ + +#define MAX_DEVID 8 /* the max num of Da-vinci devices */ + +extern struct static_key_false share_pool_enabled_key; + +struct sp_walk_data { + struct page **pages; + unsigned int page_count; + unsigned long uva_aligned; + unsigned long page_size; + bool is_hugepage; + bool is_page_type_set; + pmd_t *pmd; +}; + +#define MAP_SHARE_POOL 0x200000 + +#define MMAP_TOP_4G_SIZE 0x100000000UL + +/* 8T - 64G size */ +#define MMAP_SHARE_POOL_NORMAL_SIZE 0x7F000000000UL +/* 64G */ +#define MMAP_SHARE_POOL_RO_SIZE 0x1000000000UL +/* 8T size*/ +#define MMAP_SHARE_POOL_DVPP_SIZE 0x80000000000UL +/* 16G size */ +#define MMAP_SHARE_POOL_16G_SIZE 0x400000000UL +/* skip 8T for stack */ +#define MMAP_SHARE_POOL_SKIP 0x80000000000UL +#define MMAP_SHARE_POOL_END (TASK_SIZE - MMAP_SHARE_POOL_SKIP) +#define MMAP_SHARE_POLL_DVPP_END (MMAP_SHARE_POOL_END) +/* MMAP_SHARE_POOL_DVPP_START should be align to 16G */ +#define MMAP_SHARE_POOL_DVPP_START (MMAP_SHARE_POLL_DVPP_END - MMAP_SHARE_POOL_DVPP_SIZE) +#define MMAP_SHARE_POOL_RO_END (MMAP_SHARE_POOL_DVPP_START) +#define 
MMAP_SHARE_POOL_RO_START (MMAP_SHARE_POOL_RO_END - MMAP_SHARE_POOL_RO_SIZE) +#define MMAP_SHARE_POOL_NORMAL_END (MMAP_SHARE_POOL_RO_START) +#define MMAP_SHARE_POOL_NORMAL_START (MMAP_SHARE_POOL_NORMAL_END - MMAP_SHARE_POOL_NORMAL_SIZE) +#define MMAP_SHARE_POOL_START (MMAP_SHARE_POOL_NORMAL_START) + +#define MMAP_SHARE_POOL_DYNAMIC_DVPP_BASE 0x100000000000ULL +#define MMAP_SHARE_POOL_DYNAMIC_DVPP_END (MMAP_SHARE_POOL_DYNAMIC_DVPP_BASE + \ + MMAP_SHARE_POOL_16G_SIZE * 64) + +#ifdef CONFIG_ASCEND_SHARE_POOL + +static inline void sp_init_mm(struct mm_struct *mm) +{ + mm->sp_group_master = NULL; +} + +/* + * Those interfaces are exported for modules + */ +extern int mg_sp_group_add_task(int tgid, unsigned long prot, int spg_id); +extern int mg_sp_group_id_by_pid(int tgid, int *spg_ids, int *num); +extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); + +extern void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id); +extern void *mg_sp_alloc_nodemask(unsigned long size, unsigned long sp_flags, int spg_id, + nodemask_t nodemask); +extern int mg_sp_free(unsigned long addr, int id); + +extern void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int tgid, int spg_id); +extern void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int tgid); +extern int mg_sp_unshare(unsigned long va, unsigned long size, int spg_id); + +extern int mg_sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data); + +extern void mg_sp_walk_page_free(struct sp_walk_data *sp_walk_data); + +extern bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int tgid); + +extern bool mg_is_sharepool_addr(unsigned long addr); + +extern int mg_sp_id_of_current(void); + +extern void sp_area_drop(struct vm_area_struct *vma); +extern void sp_mm_clean(struct mm_struct *mm); +vm_fault_t sharepool_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags); +extern bool sp_check_addr(unsigned long addr); +extern bool sp_check_mmap_addr(unsigned long addr, unsigned long flags); +extern int sp_node_id(struct vm_area_struct *vma); + +static inline bool sp_is_enabled(void) +{ + return static_branch_likely(&share_pool_enabled_key); +} + +static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ + if (sp_is_enabled()) + info->high_limit = min(info->high_limit, MMAP_SHARE_POOL_START); +} + +static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + if (sp_is_enabled() && (vm_flags & VM_SHARE_POOL)) + return true; + + return false; +} + +#else /* CONFIG_ASCEND_SHARE_POOL */ + +static inline int mg_sp_group_add_task(int tgid, unsigned long prot, int spg_id) +{ + return -EPERM; +} + +static inline void sp_mm_clean(struct mm_struct *mm) +{ +} + +static inline int mg_sp_group_id_by_pid(int tgid, int *spg_ids, int *num) +{ + return -EPERM; +} + +static inline int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return -EPERM; +} + +static inline void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) +{ + return NULL; +} + +static inline int mg_sp_free(unsigned long addr, int id) +{ + return -EPERM; +} + +static inline void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int tgid, int 
spg_id) +{ + return NULL; +} + +static inline void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int tgid) +{ + return NULL; +} + +static inline int mg_sp_unshare(unsigned long va, unsigned long size, int id) +{ + return -EPERM; +} + +static inline int mg_sp_id_of_current(void) +{ + return -EPERM; +} + +static inline void sp_init_mm(struct mm_struct *mm) +{ +} + +static inline void sp_area_drop(struct vm_area_struct *vma) +{ +} + +static inline int mg_sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + return 0; +} + +static inline void mg_sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ +} + +static inline bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int tgid) +{ + return false; +} + +static inline bool mg_is_sharepool_addr(unsigned long addr) +{ + return false; +} + +static inline void spa_overview_show(struct seq_file *seq) +{ +} + +static inline void spg_overview_show(struct seq_file *seq) +{ +} + +static inline bool sp_is_enabled(void) +{ + return false; +} + +static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ +} + +static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + return false; +} + +static inline bool is_vm_huge_special(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool sp_check_addr(unsigned long addr) +{ + return false; +} + +static inline bool sp_check_mmap_addr(unsigned long addr, unsigned long flags) +{ + return false; +} + +static inline vm_fault_t sharepool_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) +{ + return VM_FAULT_SIGBUS; +} + +static inline int sp_node_id(struct vm_area_struct *vma) +{ + return numa_node_id(); +} + +#endif /* !CONFIG_ASCEND_SHARE_POOL */ + +#endif /* LINUX_SHARE_POOL_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8ddde9cc947c685e64923139c66c3f9..99a0e91aed1a2b65e476978b159c8c51f6058e23 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -36,6 +36,12 @@ struct iov_iter; /* in uio.h */ #define VM_DEFER_KMEMLEAK 0 #endif +#ifdef CONFIG_EXTEND_HUGEPAGE_MAPPING +#define VM_HUGE_PAGES 0x00001000 /* Allocate hugepages only */ +#else +#define VM_HUGE_PAGES 0 +#endif + /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -173,6 +179,18 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); +#ifdef CONFIG_EXTEND_HUGEPAGE_MAPPING +extern void *vmalloc_hugepage(unsigned long size); +extern void *vmalloc_hugepage_user(unsigned long size); +extern void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot); +extern int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, + unsigned long pgoff, unsigned long size); +extern int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, + void *addr, unsigned long pgoff); +#endif + /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() diff --git a/kernel/fork.c b/kernel/fork.c index 7e11bdaef25777e8abf98994894d8b6d409bf47f..75acc20ca58545fc168b43553db0decdc6acbd59 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -99,6 +99,7 @@ #include #include 
#include +#include #ifdef CONFIG_GMEM #include @@ -1327,6 +1328,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) goto fail_pcpu; + sp_init_mm(mm); mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; @@ -1368,6 +1370,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + sp_mm_clean(mm); mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { diff --git a/mm/Makefile b/mm/Makefile index cc147c0d7ca0c0faaf7a86c174bdd9aecf7b431e..9dc607e36f4b542d227b1104bb522074394582ef 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -139,3 +139,4 @@ obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_ETMEM) += etmem.o +obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o diff --git a/mm/gup.c b/mm/gup.c index bbe4162365933e5d52f84c34037cb7d0949bc64d..636016248f38c97715d3b56605ffd7f8de689327 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1604,16 +1604,9 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, return ret; } -/* - * __mm_populate - populate and/or mlock pages within a range of address space. - * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_lock must not be held. - */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +int do_mm_populate(struct mm_struct *mm, unsigned long start, unsigned long len, + int ignore_errors) { - struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; @@ -1664,6 +1657,18 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_lock must not be held. 
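For illustration, a minimal sketch of how a caller such as the share pool might drive the new do_mm_populate() on a foreign mm; only the do_mm_populate() signature and the "mmap_lock must not be held" rule come from this patch, and the helper name sp_populate_range() is hypothetical:

```c
/*
 * Hypothetical helper (not part of the patch): populate a range that was
 * previously mapped into another process's mm. do_mm_populate() takes
 * mmap_lock internally, so it is called without the lock held.
 */
static int sp_populate_range(struct mm_struct *mm, unsigned long va,
			     unsigned long size)
{
	int ret;

	if (!mmget_not_zero(mm))	/* keep the address space alive */
		return -ESRCH;

	ret = do_mm_populate(mm, va, size, /* ignore_errors */ 0);

	mmput(mm);
	return ret;
}
```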
+ */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + return do_mm_populate(current->mm, start, len, ignore_errors); +} #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 04db457a32abfa054cd880733898b8e5273d3da8..5b2750ce30e6d21ca38742cca5198ab11561cd6f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -5474,6 +5475,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct pte = huge_ptep_get_and_clear(mm, address, ptep); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); + + /* sharepool k2u mapped pages are marked special */ + if (sp_check_vm_share_pool(vma->vm_flags) && pte_special(pte)) { + spin_unlock(ptl); + continue; + } + if (huge_pte_dirty(pte)) set_page_dirty(page); /* Leave a uffd-wp pte marker if needed */ @@ -7726,3 +7734,73 @@ static void __init hugetlb_cma_check(void) } #endif /* CONFIG_CMA */ + +#ifdef CONFIG_ASCEND_FEATURES +static pte_t *hugetlb_huge_pte_alloc(struct mm_struct *mm, unsigned long addr, + unsigned long size) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pte_t *ptep = NULL; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_offset(pgdp, addr); + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + return NULL; + + ptep = (pte_t *)pmd_alloc(mm, pudp, addr); + + return ptep; +} + +static int __hugetlb_insert_hugepage(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, unsigned long pfn) +{ + int ret = 0; + pte_t *ptep, entry; + struct hstate *h; + spinlock_t *ptl; + + h = size_to_hstate(PMD_SIZE); + if (!h) + return -EINVAL; + + ptep = hugetlb_huge_pte_alloc(mm, addr, huge_page_size(h)); + if (!ptep) + return -ENXIO; + + if (WARN_ON(ptep && !pte_none(*ptep) && !pmd_huge(*(pmd_t *)ptep))) + return -ENXIO; + + entry = pfn_pte(pfn, prot); + entry = huge_pte_mkdirty(entry); + if (!(pgprot_val(prot) & PTE_RDONLY)) + entry = huge_pte_mkwrite(entry); + entry = pte_mkyoung(entry); + entry = pte_mkhuge(entry); + entry = pte_mkspecial(entry); + + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); + set_huge_pte_at(mm, addr, ptep, entry); + spin_unlock(ptl); + + return ret; +} + +int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, struct page *hpage) +{ + return __hugetlb_insert_hugepage(mm, addr, prot, page_to_pfn(hpage)); +} +EXPORT_SYMBOL_GPL(hugetlb_insert_hugepage_pte); + +int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, unsigned long addr, + pgprot_t prot, unsigned long phy_addr) +{ + return __hugetlb_insert_hugepage(mm, addr, prot, phy_addr >> PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(hugetlb_insert_hugepage_pte_by_pa); +#endif /* CONFIG_ASCEND_FEATURES */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5ed13fe2bd75397745ff60fb28ebcbe604644a92..76855b3d1fbe9b2377fa26f4ab9705815213e5c3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1245,11 +1245,10 @@ static struct page *new_page(struct page *page, unsigned long start) } #endif -static long do_mbind(unsigned long start, unsigned long len, +long __do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, - nodemask_t *nmask, unsigned long flags) + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm) { - struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct mempolicy *new; @@ -1356,6 +1355,13 
@@ static long do_mbind(unsigned long start, unsigned long len, return err; } +static long do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags) +{ + return __do_mbind(start, len, mode, mode_flags, nmask, flags, current->mm); +} + /* * User space interface with variable sized bitmaps for nodelists. */ diff --git a/mm/mmap.c b/mm/mmap.c index 91cad54a353fecc4d102749ef9de7951615fdb29..bff71a766e0aa8df6acc4e90080386b6ae01902f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_GMEM #include @@ -146,6 +147,7 @@ static void remove_vma(struct vm_area_struct *vma, bool unreachable) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); + sp_area_drop(vma); if (unreachable) __vm_area_free(vma); else @@ -974,6 +976,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; + /* don't merge this kind of vma as sp_area couldn't be merged */ + if (sp_check_vm_share_pool(vm_flags)) + return NULL; + /* Does the input range span an existing VMA? (cases 5 - 8) */ curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); @@ -1264,16 +1270,18 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, return true; } +static unsigned long __mmap_region(struct mm_struct *mm, + struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf); /* * The caller must write-lock current->mm->mmap_lock. */ -unsigned long do_mmap(struct file *file, unsigned long addr, +unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { - struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; int pkey = 0; validate_mm(mm); @@ -1344,7 +1352,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. 
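A hedged sketch of how the reworked mmap path might be used: __do_mmap_mm() now takes the target mm and a pre-set vm_flags, so a share-pool style caller can map its backing file into a member task with VM_SHARE_POOL already applied. Only the __do_mmap_mm()/do_mm_populate() signatures and the MAP_SHARE_POOL/VM_SHARE_POOL definitions come from this patch; sp_mmap_into(), the pgoff choice, and the error handling are assumptions:

```c
/* Illustrative only: map a share pool backing file into a target mm. */
static unsigned long sp_mmap_into(struct mm_struct *mm, struct file *file,
				  unsigned long va, unsigned long size,
				  unsigned long prot)
{
	unsigned long populate = 0;
	unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_POPULATE |
			      MAP_SHARE_POOL;
	unsigned long addr;

	mmap_write_lock(mm);	/* do_mmap-style callers hold mmap_lock for write */
	addr = __do_mmap_mm(mm, file, va, size, prot, flags,
			    VM_SHARE_POOL, va >> PAGE_SHIFT,
			    &populate, NULL);
	mmap_write_unlock(mm);

	if (!IS_ERR_VALUE(addr) && populate)
		do_mm_populate(mm, addr, populate, 0);

	return addr;
}
```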
*/ - vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) @@ -1454,7 +1462,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_PEER_SHARED; #endif - addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); + addr = __mmap_region(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1462,6 +1470,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return addr; } +unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + unsigned long pgoff, unsigned long *populate, struct list_head *uf) +{ + return __do_mmap_mm(current->mm, file, addr, len, prot, flags, 0, pgoff, populate, uf); +} + unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) @@ -1716,6 +1731,7 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { unsigned long addr; + sp_area_work_around(info); if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) addr = unmapped_area_topdown(info); else @@ -2763,11 +2779,11 @@ static int alloc_va_in_peer_devices(struct mm_struct *mm, } #endif -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) +static unsigned long __mmap_region(struct mm_struct *mm, + struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf) { - struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct vm_area_struct *next, *prev, *merge; pgoff_t pglen = len >> PAGE_SHIFT; @@ -3049,6 +3065,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return error; } +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + return __mmap_region(current->mm, file, addr, len, vm_flags, pgoff, uf); +} + static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; diff --git a/mm/share_pool.c b/mm/share_pool.c new file mode 100644 index 0000000000000000000000000000000000000000..7a64ebc300ea792faeb4e3f494b0ee21c30f16dd --- /dev/null +++ b/mm/share_pool.c @@ -0,0 +1,3495 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Huawei Ascend Share Pool Memory + * + * Copyright (C) 2020 Huawei Limited + * Author: Tang Yizhou + * Zefan Li + * Wu Peng + * Ding Tianhong + * Zhou Guanghui + * Li Ming + * + * This code is based on the hisilicon ascend platform. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) "share pool: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Use spa va address as mmap offset. This can work because spa_file + * is setup with 64-bit address space. 
So va shall be well covered. + */ +#define addr_offset(spa) ((spa)->va_start) + +#define byte2kb(size) ((size) >> 10) +#define byte2mb(size) ((size) >> 20) +#define page2kb(page_num) ((page_num) << (PAGE_SHIFT - 10)) + +#define MAX_GROUP_FOR_SYSTEM 50000 +#define MAX_GROUP_FOR_TASK 3000 +#define MAX_PROC_PER_GROUP 1024 + +#define GROUP_NONE 0 + +#define SEC2US(sec) ((sec) * 1000000) +#define NS2US(ns) ((ns) / 1000) + +#define PF_DOMAIN_CORE 0x10000000 /* AOS CORE processes in sched.h */ + +static int system_group_count; + +/* idr of all sp_groups */ +static DEFINE_IDR(sp_group_idr); +/* rw semaphore for sp_group_idr and mm->sp_group_master */ +static DECLARE_RWSEM(sp_global_sem); + +/*** Statistical and maintenance tools ***/ + +/* list of all sp_group_masters */ +static LIST_HEAD(master_list); +/* mutex to protect insert/delete ops from master_list */ +static DEFINE_MUTEX(master_list_lock); + +/* list of all spm-dvpp */ +static LIST_HEAD(spm_dvpp_list); +/* mutex to protect insert/delete ops from master_list */ +static DEFINE_MUTEX(spm_list_lock); + +#define SEQ_printf(m, x...) \ +do { \ + if (m) \ + seq_printf(m, x); \ + else \ + pr_info(x); \ +} while (0) + +struct sp_meminfo { + /* not huge page size from sp_alloc */ + atomic64_t alloc_nsize; + /* huge page size from sp_alloc */ + atomic64_t alloc_hsize; + /* total size from sp_k2u */ + atomic64_t k2u_size; +}; + +enum sp_mapping_type { + SP_MAPPING_START, + SP_MAPPING_DVPP = SP_MAPPING_START, + SP_MAPPING_NORMAL, + SP_MAPPING_RO, + SP_MAPPING_END, +}; + +/* + * address space management + */ +struct sp_mapping { + unsigned long type; + atomic_t user; + unsigned long start[MAX_DEVID]; + unsigned long end[MAX_DEVID]; + struct rb_root area_root; + + struct rb_node *free_area_cache; + unsigned long cached_hole_size; + unsigned long cached_vstart; + + /* list head for all groups attached to this mapping, dvpp mapping only */ + struct list_head group_head; + struct list_head spm_node; + spinlock_t sp_mapping_lock; +}; + +/* Processes in the same sp_group can share memory. + * Memory layout for share pool: + * + * |-------------------- 8T -------------------|---|------ 8T ------------| + * | Device 0 | Device 1 |...| | + * |----------------------------------------------------------------------| + * |------------- 16G -------------| 16G | | | + * | DVPP GROUP0 | DVPP GROUP1 | ... | ... |...| sp normal memory | + * | sp | sp | | | | | + * |----------------------------------------------------------------------| + * + * The host SVM feature reserves 8T virtual memory by mmap, and due to the + * restriction of DVPP, while SVM and share pool will both allocate memory + * for DVPP, the memory have to be in the same 32G range. + * + * Share pool reserves 16T memory, with 8T for normal uses and 8T for DVPP. + * Within this 8T DVPP memory, SVM will call sp_config_dvpp_range() to + * tell us which 16G memory range is reserved for share pool . + * + * In some scenarios where there is no host SVM feature, share pool uses + * the default 8G memory setting for DVPP. 
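To make the layout above concrete, a small illustrative sketch of the DVPP handshake it describes: a hypothetical driver probe path reserves the 16G window of one device via mg_sp_config_dvpp_range(). Only mg_sp_config_dvpp_range(), MMAP_SHARE_POOL_DVPP_START and MMAP_SHARE_POOL_16G_SIZE are from this patch; the helper name and the tgid choice are assumptions:

```c
/* Illustrative only: reserve the DVPP window of one device for the caller. */
static int example_reserve_dvpp_window(int device_id)
{
	/* per-device 16G slice, matching the layout described above */
	size_t start = MMAP_SHARE_POOL_DVPP_START +
		       (size_t)device_id * MMAP_SHARE_POOL_16G_SIZE;

	/* pass the caller's tgid (assumed to be what the API expects) */
	if (!mg_sp_config_dvpp_range(start, MMAP_SHARE_POOL_16G_SIZE,
				     device_id, current->tgid))
		return -EINVAL;

	return 0;
}
```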
+ */ +struct sp_group { + int id; + struct file *file; + struct file *file_hugetlb; + /* number of process in this group */ + int proc_num; + /* list head of processes (sp_group_node, each represents a process) */ + struct list_head proc_head; + /* it is protected by rw_lock of this spg */ + struct rb_root spa_root; + /* group statistics */ + struct sp_meminfo meminfo; + atomic_t use_count; + atomic_t spa_num; + /* protect the group internal elements */ + struct rw_semaphore rw_lock; + /* list node for dvpp mapping */ + struct list_head mnode; + struct sp_mapping *mapping[SP_MAPPING_END]; +}; + +/* a per-process(per mm) struct which manages a sp_group_node list */ +struct sp_group_master { + pid_t tgid; + /* + * number of sp groups the process belongs to, + * a.k.a the number of sp_node in group_head + */ + unsigned int group_num; + /* list head of sp_node */ + struct list_head group_head; + struct mm_struct *mm; + /* + * Used to apply for the shared pool memory of the current process. + * For example, sp_alloc non-share memory or k2task. + */ + struct sp_group *local; + struct sp_meminfo meminfo; + struct list_head list_node; + char comm[TASK_COMM_LEN]; +}; + +/* + * each instance represents an sp group the process belongs to + * sp_group_master : sp_group_node = 1 : N + * sp_group_node->spg : sp_group = 1 : 1 + * sp_group_node : sp_group->proc_head = N : 1 + */ +struct sp_group_node { + /* list node in sp_group->proc_head */ + struct list_head proc_node; + /* list node in sp_group_maseter->group_head */ + struct list_head group_node; + struct sp_group_master *master; + struct sp_group *spg; + unsigned long prot; + + /* + * alloc amount minus free amount, may be negative when freed by + * another task in the same sp group. + */ + struct sp_meminfo meminfo; +}; + +static inline void sp_add_group_master(struct sp_group_master *master) +{ + mutex_lock(&master_list_lock); + list_add_tail(&master->list_node, &master_list); + mutex_unlock(&master_list_lock); +} + +static inline void sp_del_group_master(struct sp_group_master *master) +{ + mutex_lock(&master_list_lock); + list_del(&master->list_node); + mutex_unlock(&master_list_lock); +} + +static void meminfo_init(struct sp_meminfo *meminfo) +{ + memset(meminfo, 0, sizeof(struct sp_meminfo)); +} + +static void meminfo_inc_usage(unsigned long size, bool huge, struct sp_meminfo *meminfo) +{ + if (huge) + atomic64_add(size, &meminfo->alloc_hsize); + else + atomic64_add(size, &meminfo->alloc_nsize); +} + +static void meminfo_dec_usage(unsigned long size, bool huge, struct sp_meminfo *meminfo) +{ + if (huge) + atomic64_sub(size, &meminfo->alloc_hsize); + else + atomic64_sub(size, &meminfo->alloc_nsize); +} + +static void meminfo_inc_k2u(unsigned long size, struct sp_meminfo *meminfo) +{ + atomic64_add(size, &meminfo->k2u_size); +} + +static void meminfo_dec_k2u(unsigned long size, struct sp_meminfo *meminfo) +{ + atomic64_sub(size, &meminfo->k2u_size); +} + +static inline long meminfo_alloc_sum(struct sp_meminfo *meminfo) +{ + return atomic64_read(&meminfo->alloc_nsize) + + atomic64_read(&meminfo->alloc_hsize); +} + +static inline long meminfo_alloc_sum_byKB(struct sp_meminfo *meminfo) +{ + return byte2kb(meminfo_alloc_sum(meminfo)); +} + +static inline long meminfo_k2u_size(struct sp_meminfo *meminfo) +{ + return byte2kb(atomic64_read(&meminfo->k2u_size)); +} + +static inline long long meminfo_total_size(struct sp_meminfo *meminfo) +{ + return atomic64_read(&meminfo->alloc_nsize) + + atomic64_read(&meminfo->alloc_hsize) + + 
atomic64_read(&meminfo->k2u_size); +} + +static unsigned long sp_mapping_type(struct sp_mapping *spm) +{ + return spm->type; +} + +static void sp_mapping_set_type(struct sp_mapping *spm, unsigned long type) +{ + spm->type = type; +} + +static struct sp_mapping *sp_mapping_normal; +static struct sp_mapping *sp_mapping_ro; + +static void sp_mapping_add_to_list(struct sp_mapping *spm) +{ + mutex_lock(&spm_list_lock); + if (sp_mapping_type(spm) == SP_MAPPING_DVPP) + list_add_tail(&spm->spm_node, &spm_dvpp_list); + mutex_unlock(&spm_list_lock); +} + +static void sp_mapping_remove_from_list(struct sp_mapping *spm) +{ + mutex_lock(&spm_list_lock); + if (sp_mapping_type(spm) == SP_MAPPING_DVPP) + list_del(&spm->spm_node); + mutex_unlock(&spm_list_lock); +} + +static void sp_mapping_range_init(struct sp_mapping *spm) +{ + int i; + + for (i = 0; i < MAX_DEVID; i++) { + switch (sp_mapping_type(spm)) { + case SP_MAPPING_RO: + spm->start[i] = MMAP_SHARE_POOL_RO_START; + spm->end[i] = MMAP_SHARE_POOL_RO_END; + break; + case SP_MAPPING_NORMAL: + spm->start[i] = MMAP_SHARE_POOL_NORMAL_START; + spm->end[i] = MMAP_SHARE_POOL_NORMAL_END; + break; + case SP_MAPPING_DVPP: + spm->start[i] = MMAP_SHARE_POOL_DVPP_START + i * MMAP_SHARE_POOL_16G_SIZE; + spm->end[i] = spm->start[i] + MMAP_SHARE_POOL_16G_SIZE; + break; + default: + pr_err("Invalid sp_mapping type [%lu]\n", sp_mapping_type(spm)); + break; + } + } +} + +static struct sp_mapping *sp_mapping_create(unsigned long type) +{ + struct sp_mapping *spm; + + spm = kzalloc(sizeof(struct sp_mapping), GFP_KERNEL); + if (!spm) + return NULL; + + sp_mapping_set_type(spm, type); + sp_mapping_range_init(spm); + atomic_set(&spm->user, 0); + spm->area_root = RB_ROOT; + INIT_LIST_HEAD(&spm->group_head); + spin_lock_init(&spm->sp_mapping_lock); + sp_mapping_add_to_list(spm); + + return spm; +} + +static void sp_mapping_destroy(struct sp_mapping *spm) +{ + sp_mapping_remove_from_list(spm); + kfree(spm); +} + +static void sp_mapping_attach(struct sp_group *spg, struct sp_mapping *spm) +{ + unsigned long type = sp_mapping_type(spm); + + atomic_inc(&spm->user); + spg->mapping[type] = spm; + if (type == SP_MAPPING_DVPP) + list_add_tail(&spg->mnode, &spm->group_head); +} + +static void sp_mapping_detach(struct sp_group *spg, struct sp_mapping *spm) +{ + unsigned long type; + + if (!spm) + return; + + type = sp_mapping_type(spm); + if (type == SP_MAPPING_DVPP) + list_del(&spg->mnode); + if (atomic_dec_and_test(&spm->user)) + sp_mapping_destroy(spm); + + spg->mapping[type] = NULL; +} + +/* merge old mapping to new, and the old mapping would be destroyed */ +static void sp_mapping_merge(struct sp_mapping *new, struct sp_mapping *old) +{ + struct sp_group *spg, *tmp; + + if (new == old) + return; + + list_for_each_entry_safe(spg, tmp, &old->group_head, mnode) { + list_move_tail(&spg->mnode, &new->group_head); + spg->mapping[SP_MAPPING_DVPP] = new; + } + + atomic_add(atomic_read(&old->user), &new->user); + sp_mapping_destroy(old); +} + +static bool is_mapping_empty(struct sp_mapping *spm) +{ + return RB_EMPTY_ROOT(&spm->area_root); +} + +static bool can_mappings_merge(struct sp_mapping *m1, struct sp_mapping *m2) +{ + int i; + + for (i = 0; i < MAX_DEVID; i++) + if (m1->start[i] != m2->start[i] || m1->end[i] != m2->end[i]) + return false; + + return true; +} + +/* + * 1. The mappings of local group is set on creating. + * 2. This is used to setup the mapping for groups created during add_task. + * 3. The normal mapping exists for all groups. + * 4. 
The dvpp mappings for the new group and local group can merge _iff_ at + * least one of the mapping is empty. + * the caller must hold sp_global_sem + * NOTE: undo the mergeing when the later process failed. + */ +static int sp_group_setup_mapping_normal(struct mm_struct *mm, struct sp_group *spg) +{ + struct sp_mapping *local_dvpp_mapping, *spg_dvpp_mapping; + + local_dvpp_mapping = mm->sp_group_master->local->mapping[SP_MAPPING_DVPP]; + spg_dvpp_mapping = spg->mapping[SP_MAPPING_DVPP]; + + if (!list_empty(&spg->proc_head)) { + /* + * Don't return an error when the mappings' address range conflict. + * As long as the mapping is unused, we can drop the empty mapping. + * This may change the address range for the task or group implicitly, + * give a warn for it. + */ + bool is_conflict = !can_mappings_merge(local_dvpp_mapping, spg_dvpp_mapping); + + if (is_mapping_empty(local_dvpp_mapping)) { + sp_mapping_merge(spg_dvpp_mapping, local_dvpp_mapping); + if (is_conflict) + pr_warn_ratelimited("task address space conflict, spg_id=%d\n", + spg->id); + } else if (is_mapping_empty(spg_dvpp_mapping)) { + sp_mapping_merge(local_dvpp_mapping, spg_dvpp_mapping); + if (is_conflict) + pr_warn_ratelimited("group address space conflict, spg_id=%d\n", + spg->id); + } else { + pr_info_ratelimited("Duplicate address space, id=%d\n", spg->id); + return -EINVAL; + } + } else { + /* the mapping of local group is always set */ + sp_mapping_attach(spg, local_dvpp_mapping); + if (!spg->mapping[SP_MAPPING_NORMAL]) + sp_mapping_attach(spg, sp_mapping_normal); + if (!spg->mapping[SP_MAPPING_RO]) + sp_mapping_attach(spg, sp_mapping_ro); + } + + return 0; +} + +static int sp_group_setup_mapping_local(struct mm_struct *mm, struct sp_group *local) +{ + struct sp_mapping *spm; + + spm = sp_mapping_create(SP_MAPPING_DVPP); + if (!spm) + return -ENOMEM; + + sp_mapping_attach(local, spm); + sp_mapping_attach(local, sp_mapping_normal); + sp_mapping_attach(local, sp_mapping_ro); + + return 0; +} + +static inline bool is_local_group(int spg_id) +{ + return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX; +} + +static int sp_group_setup_mapping(struct mm_struct *mm, struct sp_group *spg) +{ + if (is_local_group(spg->id)) + return sp_group_setup_mapping_local(mm, spg); + else + return sp_group_setup_mapping_normal(mm, spg); +} + +static struct sp_group *sp_group_create(int spg_id); +static void sp_group_put_locked(struct sp_group *spg); +static int sp_group_link_task(struct mm_struct *mm, struct sp_group *spg, + unsigned long prot, struct sp_group_node **pnode); +static int init_local_group(struct mm_struct *mm) +{ + int ret; + struct sp_group *spg; + struct sp_group_master *master = mm->sp_group_master; + + spg = sp_group_create(SPG_ID_LOCAL); + if (IS_ERR(spg)) + return PTR_ERR(spg); + + ret = sp_group_link_task(mm, spg, PROT_READ | PROT_WRITE, NULL); + sp_group_put_locked(spg); + if (!ret) + master->local = spg; + + return ret; +} + +/* The caller must hold sp_global_sem and the input @mm cannot be freed */ +static int sp_init_group_master_locked(struct task_struct *tsk, struct mm_struct *mm) +{ + int ret; + struct sp_group_master *master; + + if (mm->sp_group_master) + return 0; + + master = kmalloc(sizeof(struct sp_group_master), GFP_KERNEL); + if (!master) + return -ENOMEM; + + INIT_LIST_HEAD(&master->group_head); + master->group_num = 0; + master->mm = mm; + master->tgid = tsk->tgid; + get_task_comm(master->comm, current); + meminfo_init(&master->meminfo); + mm->sp_group_master = master; + 
sp_add_group_master(master); + + ret = init_local_group(mm); + if (ret) + goto free_master; + + return 0; + +free_master: + sp_del_group_master(master); + mm->sp_group_master = NULL; + kfree(master); + + return ret; +} + +static int sp_init_group_master(struct task_struct *tsk, struct mm_struct *mm) +{ + int ret; + + down_read(&sp_global_sem); + /* The sp_group_master would never change once set */ + if (mm->sp_group_master) { + up_read(&sp_global_sem); + return 0; + } + up_read(&sp_global_sem); + + down_write(&sp_global_sem); + ret = sp_init_group_master_locked(tsk, mm); + up_write(&sp_global_sem); + + return ret; +} + +static struct sp_group *sp_get_local_group(struct task_struct *tsk, struct mm_struct *mm) +{ + int ret; + struct sp_group_master *master; + + down_read(&sp_global_sem); + master = mm->sp_group_master; + if (master && master->local) { + atomic_inc(&master->local->use_count); + up_read(&sp_global_sem); + return master->local; + } + up_read(&sp_global_sem); + + down_write(&sp_global_sem); + ret = sp_init_group_master_locked(tsk, mm); + if (ret) { + up_write(&sp_global_sem); + return ERR_PTR(ret); + } + master = mm->sp_group_master; + atomic_inc(&master->local->use_count); + up_write(&sp_global_sem); + + return master->local; +} + +static void update_mem_usage_alloc(unsigned long size, bool inc, + bool is_hugepage, struct sp_group_node *spg_node) +{ + if (inc) { + meminfo_inc_usage(size, is_hugepage, &spg_node->meminfo); + meminfo_inc_usage(size, is_hugepage, &spg_node->master->meminfo); + } else { + meminfo_dec_usage(size, is_hugepage, &spg_node->meminfo); + meminfo_dec_usage(size, is_hugepage, &spg_node->master->meminfo); + } +} + +static void update_mem_usage_k2u(unsigned long size, bool inc, + struct sp_group_node *spg_node) +{ + if (inc) { + meminfo_inc_k2u(size, &spg_node->meminfo); + meminfo_inc_k2u(size, &spg_node->master->meminfo); + } else { + meminfo_dec_k2u(size, &spg_node->meminfo); + meminfo_dec_k2u(size, &spg_node->master->meminfo); + } +} + +struct sp_spa_stat { + atomic64_t alloc_num; + atomic64_t k2u_task_num; + atomic64_t k2u_spg_num; + atomic64_t alloc_size; + atomic64_t k2u_task_size; + atomic64_t k2u_spg_size; + atomic64_t dvpp_size; + atomic64_t dvpp_va_size; +}; + +static struct sp_spa_stat spa_stat; + +/* statistics of all sp group born from sp_alloc and k2u(spg) */ +struct sp_overall_stat { + atomic_t spa_total_num; + atomic64_t spa_total_size; +}; + +static struct sp_overall_stat sp_overall_stat; + +/*** Global share pool VA allocator ***/ + +enum spa_type { + SPA_TYPE_ALLOC = 1, + SPA_TYPE_K2TASK, + SPA_TYPE_K2SPG, +}; + +/* + * The lifetime for a sp_area: + * 1. The sp_area was created from a sp_mapping with sp_mapping_lock held. + * 2. The sp_area was added into a sp_group (using rb_tree). + * 3. The sp_area was mapped to all tasks in the sp_group and we bump its + * reference when each mmap succeeded. + * 4. When a new task was added into the sp_group, we map the sp_area into + * the new task and increase its reference count. + * 5. When a task was deleted from the sp_group, we unmap the sp_area for + * the task and decrease its reference count. + * 6. Also, we can use sp_free/sp_unshare to unmap the sp_area for all the + * tasks in the sp_group. And the reference count was decreased for each + * munmap. + * 7. When the refcount for sp_area reach zero: + * a. the sp_area would firstly be deleted from the sp_group and then + * deleted from sp_mapping. + * b. no one should use the sp_area from the view of sp_group. + * c. 
the spa->spg should not be used when the sp_area is not on a spg. + * + * The locking rules: + * 1. The newly created sp_area with a refcount of one. This is to distinct + * the new sp_area from a dying sp_area. + * 2. Use spg->rw_lock to protect all the sp_area in the sp_group. And the + * sp_area cannot be deleted without spg->rw_lock. + */ +struct sp_area { + unsigned long va_start; + unsigned long va_end; /* va_end always align to hugepage */ + unsigned long real_size; /* real size with alignment */ + unsigned long region_vstart; /* belong to normal region or DVPP region */ + unsigned long flags; + bool is_hugepage; + atomic_t use_count; /* How many vmas use this VA region */ + struct rb_node rb_node; /* address sorted rbtree */ + struct rb_node spg_link; /* link to the spg->rb_root */ + struct sp_group *spg; + struct sp_mapping *spm; /* where spa born from */ + enum spa_type type; + unsigned long kva; /* shared kva */ + pid_t applier; /* the original applier process */ + int preferred_node_id; /* memory node */ + struct work_struct work; +}; + +static unsigned long spa_size(struct sp_area *spa) +{ + return spa->real_size; +} + +static struct file *spa_file(struct sp_area *spa) +{ + if (spa->is_hugepage) + return spa->spg->file_hugetlb; + else + return spa->spg->file; +} + +/* the caller should hold sp_area_lock */ +static void spa_inc_usage(struct sp_area *spa) +{ + enum spa_type type = spa->type; + unsigned long size = spa->real_size; + bool is_dvpp = spa->flags & SP_DVPP; + bool is_huge = spa->is_hugepage; + + switch (type) { + case SPA_TYPE_ALLOC: + atomic64_inc(&spa_stat.alloc_num); + atomic64_add(size, &spa_stat.alloc_size); + meminfo_inc_usage(size, is_huge, &spa->spg->meminfo); + break; + case SPA_TYPE_K2TASK: + atomic64_inc(&spa_stat.k2u_task_num); + atomic64_add(size, &spa_stat.k2u_task_size); + meminfo_inc_k2u(size, &spa->spg->meminfo); + break; + case SPA_TYPE_K2SPG: + atomic64_inc(&spa_stat.k2u_spg_num); + atomic64_add(size, &spa_stat.k2u_spg_size); + meminfo_inc_k2u(size, &spa->spg->meminfo); + break; + default: + WARN(1, "invalid spa type"); + } + + if (is_dvpp) { + atomic64_add(size, &spa_stat.dvpp_size); + atomic64_add(ALIGN(size, PMD_SIZE), &spa_stat.dvpp_va_size); + } + + if (!is_local_group(spa->spg->id)) { + atomic_inc(&sp_overall_stat.spa_total_num); + atomic64_add(size, &sp_overall_stat.spa_total_size); + } +} + +/* the caller should hold sp_area_lock */ +static void spa_dec_usage(struct sp_area *spa) +{ + enum spa_type type = spa->type; + unsigned long size = spa->real_size; + bool is_dvpp = spa->flags & SP_DVPP; + bool is_huge = spa->is_hugepage; + + switch (type) { + case SPA_TYPE_ALLOC: + atomic64_dec(&spa_stat.alloc_num); + atomic64_sub(size, &spa_stat.alloc_size); + meminfo_dec_usage(size, is_huge, &spa->spg->meminfo); + break; + case SPA_TYPE_K2TASK: + atomic64_dec(&spa_stat.k2u_task_num); + atomic64_sub(size, &spa_stat.k2u_task_size); + meminfo_dec_k2u(size, &spa->spg->meminfo); + break; + case SPA_TYPE_K2SPG: + atomic64_dec(&spa_stat.k2u_spg_num); + atomic64_sub(size, &spa_stat.k2u_spg_size); + meminfo_dec_k2u(size, &spa->spg->meminfo); + break; + default: + WARN(1, "invalid spa type"); + } + + if (is_dvpp) { + atomic64_sub(size, &spa_stat.dvpp_size); + atomic64_sub(ALIGN(size, PMD_SIZE), &spa_stat.dvpp_va_size); + } + + if (!is_local_group(spa->spg->id)) { + atomic_dec(&sp_overall_stat.spa_total_num); + atomic64_sub(spa->real_size, &sp_overall_stat.spa_total_size); + } +} + +static void update_mem_usage(unsigned long size, bool inc, bool is_hugepage, + 
struct sp_group_node *spg_node, enum spa_type type) +{ + switch (type) { + case SPA_TYPE_ALLOC: + update_mem_usage_alloc(size, inc, is_hugepage, spg_node); + break; + case SPA_TYPE_K2TASK: + case SPA_TYPE_K2SPG: + update_mem_usage_k2u(size, inc, spg_node); + break; + default: + WARN(1, "invalid stat type\n"); + } +} + +static inline void check_interrupt_context(void) +{ + if (unlikely(in_interrupt())) + panic("function can't be used in interrupt context\n"); +} + +struct sp_alloc_context { + unsigned long size; + unsigned long size_aligned; + unsigned long sp_flags; + nodemask_t *nodemask; + int preferred_node_id; + bool have_mbind; + enum spa_type type; +}; + +static int sp_map_spa_to_mm(struct mm_struct *mm, struct sp_area *spa, + unsigned long prot, struct sp_alloc_context *ac, + const char *str); + +struct sp_k2u_context { + unsigned long kva; + unsigned long kva_aligned; + unsigned long size; + unsigned long size_aligned; + unsigned long sp_flags; + enum spa_type type; +}; + +static void free_sp_group_locked(struct sp_group *spg) +{ + int type; + + fput(spg->file); + fput(spg->file_hugetlb); + idr_remove(&sp_group_idr, spg->id); + + for (type = SP_MAPPING_START; type < SP_MAPPING_END; type++) + sp_mapping_detach(spg, spg->mapping[type]); + + if (!is_local_group(spg->id)) + system_group_count--; + + kfree(spg); + WARN(system_group_count < 0, "unexpected group count\n"); +} + +static void free_sp_group(struct sp_group *spg) +{ + down_write(&sp_global_sem); + free_sp_group_locked(spg); + up_write(&sp_global_sem); +} + +static void sp_group_put_locked(struct sp_group *spg) +{ + lockdep_assert_held_write(&sp_global_sem); + + if (atomic_dec_and_test(&spg->use_count)) + free_sp_group_locked(spg); +} + +static void sp_group_put(struct sp_group *spg) +{ + if (atomic_dec_and_test(&spg->use_count)) + free_sp_group(spg); +} + +/* use with put_task_struct(task) */ +static int get_task(int tgid, struct task_struct **task) +{ + struct task_struct *tsk; + struct pid *p; + + rcu_read_lock(); + p = find_pid_ns(tgid, &init_pid_ns); + tsk = pid_task(p, PIDTYPE_TGID); + if (!tsk || (tsk->flags & PF_EXITING)) { + rcu_read_unlock(); + return -ESRCH; + } + get_task_struct(tsk); + rcu_read_unlock(); + + *task = tsk; + return 0; +} + +/* + * the caller must: + * 1. hold spg->rw_lock + * 2. ensure no concurrency problem for mm_struct + */ +static bool is_process_in_group(struct sp_group *spg, + struct mm_struct *mm) +{ + struct sp_group_node *spg_node; + + list_for_each_entry(spg_node, &spg->proc_head, proc_node) + if (spg_node->master->mm == mm) + return true; + + return false; +} + +/* + * Get the sp_group from the mm and return the associated sp_group_node. + * The caller should promise the @mm would not be deleted from the @spg. 
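A minimal sketch (not part of the patch) of the lookup/put pairing implied here: the group returned by sp_group_get_from_mm() carries an elevated use_count that the caller drops with sp_group_put(); example_use_group() is a hypothetical caller:

```c
/* Hypothetical internal caller showing the expected get/put pairing. */
static int example_use_group(int spg_id)
{
	struct sp_group_node *node = NULL;
	struct sp_group *spg;

	spg = sp_group_get_from_mm(current->mm, spg_id, &node);
	if (!spg)
		return -ENODEV;

	/* ... work on spg/node, taking spg->rw_lock as needed ... */

	sp_group_put(spg);
	return 0;
}
```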
+ */ +static struct sp_group *sp_group_get_from_mm(struct mm_struct *mm, int spg_id, + struct sp_group_node **pnode) +{ + struct sp_group *spg = NULL; + struct sp_group_node *spg_node; + struct sp_group_master *master; + + down_read(&sp_global_sem); + + master = mm->sp_group_master; + if (!master) { + up_read(&sp_global_sem); + return NULL; + } + + if (spg_id == SPG_ID_DEFAULT) { + atomic_inc(&master->local->use_count); + /* There is only one task in the local group */ + *pnode = list_first_entry(&master->local->proc_head, + struct sp_group_node, proc_node); + up_read(&sp_global_sem); + return master->local; + } + + list_for_each_entry(spg_node, &master->group_head, group_node) + if (spg_node->spg->id == spg_id) { + if (atomic_inc_not_zero(&spg_node->spg->use_count)) { + spg = spg_node->spg; + *pnode = spg_node; + } + break; + } + up_read(&sp_global_sem); + + return spg; +} + +/** + * mg_sp_group_id_by_pid() - Get the sp_group ID array of a process. + * @tgid: tgid of target process. + * @spg_ids: point to an array to save the group ids the process belongs to + * @num: input the spg_ids array size; output the number of groups the process belongs to + * + * Return: + * 0 - success. + * -ENODEV - target process doesn't belong to any sp_group. + * -EINVAL - spg_ids or num is NULL. + * -E2BIG - the number of groups the process belongs to is larger than *num + */ +int mg_sp_group_id_by_pid(int tgid, int *spg_ids, int *num) +{ + int ret = 0, real_count; + struct sp_group_node *node; + struct sp_group_master *master = NULL; + struct task_struct *tsk; + + if (!sp_is_enabled()) + return -EOPNOTSUPP; + + check_interrupt_context(); + + if (!spg_ids || !num || *num <= 0) + return -EINVAL; + + ret = get_task(tgid, &tsk); + if (ret) + return ret; + + down_read(&sp_global_sem); + task_lock(tsk); + if (tsk->mm) + master = tsk->mm->sp_group_master; + task_unlock(tsk); + + if (!master) { + ret = -ENODEV; + goto out_up_read; + } + + /* + * There is a local group for each process which is used for + * passthrough allocation. The local group is an internal + * implementation for convenience and is not intended to bother + * the user. + */ + real_count = master->group_num - 1; + if (real_count <= 0) { + ret = -ENODEV; + goto out_up_read; + } + if ((unsigned int)*num < real_count) { + ret = -E2BIG; + goto out_up_read; + } + *num = real_count; + + list_for_each_entry(node, &master->group_head, group_node) { + if (is_local_group(node->spg->id)) + continue; + *(spg_ids++) = node->spg->id; + } + +out_up_read: + up_read(&sp_global_sem); + put_task_struct(tsk); + return ret; +} +EXPORT_SYMBOL_GPL(mg_sp_group_id_by_pid); + +static bool is_online_node_id(int node_id) +{ + return node_id >= 0 && node_id < MAX_NUMNODES && node_online(node_id); +} + +static void sp_group_init(struct sp_group *spg, int spg_id) +{ + spg->id = spg_id; + spg->proc_num = 0; + spg->spa_root = RB_ROOT; + atomic_set(&spg->use_count, 1); + atomic_set(&spg->spa_num, 0); + INIT_LIST_HEAD(&spg->proc_head); + INIT_LIST_HEAD(&spg->mnode); + init_rwsem(&spg->rw_lock); + meminfo_init(&spg->meminfo); +} + +/* + * sp_group_create - create a new sp_group + * @spg_id: specify the id for the new sp_group + * + * valid @spg_id: + * SPG_ID_AUTO: + * Allocate an id in range [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX] + * SPG_ID_LOCAL: + * Allocate an id in range [SPG_ID_LOCAL_MIN, SPG_ID_LOCAL_MAX] + * [SPG_ID_MIN, SPG_ID_MAX]: + * Use the input @spg_id for the new sp_group. + * + * Return: the newly created sp_group or an errno.
+ * Context: The caller should protect sp_group_idr from concurrent access. + */ +static struct sp_group *sp_group_create(int spg_id) +{ + int ret, start, end; + struct sp_group *spg; + char name[DNAME_INLINE_LEN]; + int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + + if (unlikely(system_group_count + 1 == MAX_GROUP_FOR_SYSTEM && + spg_id != SPG_ID_LOCAL)) { + pr_err("reach system max group num\n"); + return ERR_PTR(-ENOSPC); + } + + if (spg_id == SPG_ID_LOCAL) { + start = SPG_ID_LOCAL_MIN; + end = SPG_ID_LOCAL_MAX + 1; + } else if (spg_id == SPG_ID_AUTO) { + start = SPG_ID_AUTO_MIN; + end = SPG_ID_AUTO_MAX + 1; + } else if (spg_id >= SPG_ID_MIN && spg_id <= SPG_ID_MAX) { + start = spg_id; + end = spg_id + 1; + } else { + pr_err("invalid input spg_id:%d\n", spg_id); + return ERR_PTR(-EINVAL); + } + + spg = kzalloc(sizeof(*spg), GFP_KERNEL); + if (spg == NULL) + return ERR_PTR(-ENOMEM); + + ret = idr_alloc(&sp_group_idr, spg, start, end, GFP_KERNEL); + if (ret < 0) { + pr_err("group %d idr alloc failed %d\n", spg_id, ret); + goto out_kfree; + } + spg_id = ret; + + sprintf(name, "sp_group_%d", spg_id); + spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, VM_NORESERVE); + if (IS_ERR(spg->file)) { + pr_err("spg file setup failed %ld\n", PTR_ERR(spg->file)); + ret = PTR_ERR(spg->file); + goto out_idr_remove; + } + + sprintf(name, "sp_group_%d_huge", spg_id); + spg->file_hugetlb = hugetlb_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE, HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(spg->file_hugetlb)) { + pr_err("spg file_hugetlb setup failed %ld\n", PTR_ERR(spg->file_hugetlb)); + ret = PTR_ERR(spg->file_hugetlb); + goto out_fput; + } + + sp_group_init(spg, spg_id); + + if (!is_local_group(spg_id)) + system_group_count++; + + return spg; + +out_fput: + fput(spg->file); +out_idr_remove: + idr_remove(&sp_group_idr, spg_id); +out_kfree: + kfree(spg); + return ERR_PTR(ret); +} + +/* the caller must hold sp_global_sem */ +static struct sp_group *sp_group_get_or_alloc(int spg_id) +{ + struct sp_group *spg; + + spg = idr_find(&sp_group_idr, spg_id); + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + spg = sp_group_create(spg_id); + + return spg; +} + +/* the caller must hold sp_global_sem */ +static struct sp_group_node *spg_node_alloc(struct mm_struct *mm, + unsigned long prot, struct sp_group *spg) +{ + struct sp_group_master *master = mm->sp_group_master; + struct sp_group_node *spg_node; + + spg_node = kzalloc(sizeof(struct sp_group_node), GFP_KERNEL); + if (!spg_node) + return NULL; + + INIT_LIST_HEAD(&spg_node->group_node); + INIT_LIST_HEAD(&spg_node->proc_node); + spg_node->spg = spg; + spg_node->master = master; + spg_node->prot = prot; + meminfo_init(&spg_node->meminfo); + + return spg_node; +} + +/* + * sp_group_link_task - Actually add a task into a group + * @mm: specify the input task + * @spg: the sp_group + * @prot: read/write protection for the task in the group + * + * The input @mm and @spg must have been initialized properly and must not + * be freed during the sp_group_link_task(). + * The caller must hold sp_global_sem.
+ */ +static int sp_group_link_task(struct mm_struct *mm, struct sp_group *spg, + unsigned long prot, struct sp_group_node **pnode) +{ + int ret; + struct sp_group_node *node; + struct sp_group_master *master = mm->sp_group_master; + + if (master->group_num == MAX_GROUP_FOR_TASK) { + pr_err("task reaches max group num\n"); + return -ENOSPC; + } + + if (is_process_in_group(spg, mm)) { + pr_err("task already in target group(%d)\n", spg->id); + return -EEXIST; + } + + if (spg->proc_num + 1 == MAX_PROC_PER_GROUP) { + pr_err("add group: group(%d) reaches max process num\n", spg->id); + return -ENOSPC; + } + + node = spg_node_alloc(mm, prot, spg); + if (!node) + return -ENOMEM; + + ret = sp_group_setup_mapping(mm, spg); + if (ret) + goto out_kfree; + + /* + * We pin only the mm_struct instead of the memory space of the target mm. + * So we must ensure the existence of the memory space via mmget_not_zero + * before we would access it. + */ + mmgrab(mm); + master->group_num++; + list_add_tail(&node->group_node, &master->group_head); + atomic_inc(&spg->use_count); + spg->proc_num++; + list_add_tail(&node->proc_node, &spg->proc_head); + if (pnode) + *pnode = node; + + return 0; + +out_kfree: + kfree(node); + + return ret; +} + +static void sp_group_unlink_task(struct sp_group_node *spg_node) +{ + struct sp_group *spg = spg_node->spg; + struct sp_group_master *master = spg_node->master; + + list_del(&spg_node->proc_node); + spg->proc_num--; + list_del(&spg_node->group_node); + master->group_num--; + + mmdrop(master->mm); + sp_group_put_locked(spg); + kfree(spg_node); +} + +/* + * Find and initialize the mm of the task specified by @tgid. + * We increace the usercount for the mm on success. + */ +static int mm_add_group_init(pid_t tgid, struct mm_struct **pmm) +{ + int ret; + struct mm_struct *mm; + struct task_struct *tsk; + + ret = get_task(tgid, &tsk); + if (ret) + return ret; + + /* + * group_leader: current thread may be exiting in a multithread process + * + * DESIGN IDEA + * We increase mm->mm_users deliberately to ensure it's decreased in + * share pool under only 2 circumstances, which will simply the overall + * design as mm won't be freed unexpectedly. + * + * The corresponding refcount decrements are as follows: + * 1. the error handling branch of THIS function. + * 2. In sp_group_exit(). It's called only when process is exiting. + */ + mm = get_task_mm(tsk->group_leader); + if (!mm) { + ret = -ESRCH; + goto out_put_task; + } + + ret = sp_init_group_master(tsk, mm); + if (ret) + goto out_put_mm; + + if (mm->sp_group_master && mm->sp_group_master->tgid != tgid) { + pr_err("add: task(%d) is a vfork child of the original task(%d)\n", + tgid, mm->sp_group_master->tgid); + ret = -EINVAL; + goto out_put_mm; + } + *pmm = mm; + +out_put_mm: + if (ret) + mmput(mm); +out_put_task: + put_task_struct(tsk); + + return ret; +} + +static void sp_area_put_locked(struct sp_area *spa); +static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size); +/** + * mg_sp_group_add_task() - Add a process to an share group (sp_group). + * @tgid: the tgid of the task to be added. + * @prot: the prot of task for this spg. + * @spg_id: the ID of the sp_group. + * + * Return: A postive group number for success, -errno on failure. + * + * Valid @spg_id: + * [SPG_ID_MIN, SPG_ID_MAX]: + * the task would be added to the group with @spg_id, if the + * group doesn't exist, just create it. 
+ * [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX]: + * the task would be added to the group with @spg_id, if it + * doesn't exist ,return failed. + * SPG_ID_AUTO: + * the task would be added into a new group with a new id in range + * [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX]. + * + * This function can be taken into four parts: + * 1. Check and initlize the task specified by @tgid properly. + * 2. Create or get the spg specified by @spg_id. + * 3. Check the spg and task together and link the task into the spg if + * everything looks good. + * 4. Map the existing sp_area from the spg into the new task. + */ +int mg_sp_group_add_task(int tgid, unsigned long prot, int spg_id) +{ + int ret = 0; + struct sp_area *spa; + struct mm_struct *mm; + struct sp_group *spg; + struct rb_node *p, *n; + struct sp_group_node *spg_node; + + if (!sp_is_enabled()) + return -EOPNOTSUPP; + + check_interrupt_context(); + + /* only allow READ, READ | WRITE */ + if (!((prot == PROT_READ) || (prot == (PROT_READ | PROT_WRITE)))) { + pr_err_ratelimited("prot is invalid 0x%lx\n", prot); + return -EINVAL; + } + + if (spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) { + pr_err_ratelimited("add group failed, invalid group id %d\n", spg_id); + return -EINVAL; + } + + ret = mm_add_group_init(tgid, &mm); + if (ret < 0) + return ret; + + down_write(&sp_global_sem); + spg = sp_group_get_or_alloc(spg_id); + if (IS_ERR(spg)) { + ret = PTR_ERR(spg); + goto out_unlock; + } + /* save spg_id before we release sp_global_sem, or UAF may occur */ + spg_id = spg->id; + + down_write(&spg->rw_lock); + ret = sp_group_link_task(mm, spg, prot, &spg_node); + if (ret < 0) + goto put_spg; + + /* + * create mappings of existing shared memory segments into this + * new process' page table. + */ + for (p = rb_first(&spg->spa_root); p; p = n) { + n = rb_next(p); + spa = container_of(p, struct sp_area, spg_link); + + if (!atomic_inc_not_zero(&spa->use_count)) { + pr_warn("be careful, add new task(%d) to an exiting group(%d)\n", + tgid, spg_id); + continue; + } + + ret = sp_map_spa_to_mm(mm, spa, prot, NULL, "add_task"); + sp_area_put_locked(spa); + if (ret) { + pr_warn("mmap old spa to new task failed, %d\n", ret); + /* it makes no scene to skip error for coredump here */ + ret = ret < 0 ? ret : -EFAULT; + + for (p = rb_prev(p); p; p = n) { + n = rb_prev(p); + spa = container_of(p, struct sp_area, spg_link); + if (!atomic_inc_not_zero(&spa->use_count)) + continue; + sp_munmap(mm, spa->va_start, spa_size(spa)); + sp_area_put_locked(spa); + } + sp_group_unlink_task(spg_node); + break; + } + } +put_spg: + up_write(&spg->rw_lock); + sp_group_put_locked(spg); +out_unlock: + up_write(&sp_global_sem); + /* We put the mm_struct later to protect the mm from exiting while sp_mmap */ + mmput(mm); + + return ret < 0 ? 
ret : spg_id; +} +EXPORT_SYMBOL_GPL(mg_sp_group_add_task); + +int mg_sp_id_of_current(void) +{ + int ret, spg_id; + struct sp_group_master *master; + + if (!sp_is_enabled()) + return -EOPNOTSUPP; + + if ((current->flags & PF_KTHREAD) || !current->mm) + return -EINVAL; + + down_read(&sp_global_sem); + master = current->mm->sp_group_master; + if (master) { + spg_id = master->local->id; + up_read(&sp_global_sem); + return spg_id; + } + up_read(&sp_global_sem); + + down_write(&sp_global_sem); + ret = sp_init_group_master_locked(current, current->mm); + if (ret) { + up_write(&sp_global_sem); + return ret; + } + master = current->mm->sp_group_master; + spg_id = master->local->id; + up_write(&sp_global_sem); + + return spg_id; +} +EXPORT_SYMBOL_GPL(mg_sp_id_of_current); + +#define insert_sp_area(__root, __spa, node, rb_node_param) \ + do { \ + struct rb_node **p = &((__root)->rb_node); \ + struct rb_node *parent = NULL; \ + while (*p) { \ + struct sp_area *tmp; \ + parent = *p; \ + tmp = rb_entry(parent, struct sp_area, rb_node_param);\ + if (__spa->va_start < tmp->va_end) \ + p = &(*p)->rb_left; \ + else if (__spa->va_end > tmp->va_start) \ + p = &(*p)->rb_right; \ + else \ + WARN(1, "duplicate spa of tree " #__root);\ + } \ + rb_link_node((node), parent, p); \ + rb_insert_color((node), (__root)); \ + } while (0) + +/* the caller must hold sp_mapping_lock */ +static void spm_insert_area(struct sp_mapping *spm, struct sp_area *spa) +{ + insert_sp_area(&spm->area_root, spa, &spa->rb_node, rb_node); +} + +static void sp_group_insert_area(struct sp_group *spg, struct sp_area *spa) +{ + insert_sp_area(&spg->spa_root, spa, &spa->spg_link, spg_link); + atomic_inc(&spg->spa_num); + spa_inc_usage(spa); + if (atomic_read(&spg->spa_num) == 1) + atomic_inc(&spg->use_count); +} + +/* + * The caller must hold spg->rw_lock. + * Return true to indicate that ths spa_num in the spg reaches zero and the caller + * should drop the extra spg->use_count added in sp_group_insert_area(). + */ +static bool sp_group_delete_area(struct sp_group *spg, struct sp_area *spa) +{ + rb_erase(&spa->spg_link, &spg->spa_root); + spa_dec_usage(spa); + return atomic_dec_and_test(&spa->spg->spa_num); +} + +/** + * sp_area_alloc() - Allocate a region of VA from the share pool. + * @size: the size of VA to allocate. + * @flags: how to allocate the memory. + * @spg: the share group that the memory is allocated to. + * @type: the type of the region. + * @applier: the tgid of the task which allocates the region. + * + * Return: a valid pointer for success, NULL on failure. 
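+ *
+ * A call sketch (illustrative only, not part of the original patch; "size",
+ * "spg" and "node_id" are placeholders, and the IS_ERR() check mirrors how
+ * the callers below consume the result rather than comparing against NULL):
+ *
+ *	spa = sp_area_alloc(ALIGN(size, PMD_SIZE), SP_HUGEPAGE, spg,
+ *			    SPA_TYPE_ALLOC, current->tgid, node_id);
+ *	if (IS_ERR(spa))
+ *		return PTR_ERR(spa);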
+ */ +static struct sp_area *sp_area_alloc(unsigned long size, unsigned long flags, + struct sp_group *spg, enum spa_type type, + pid_t applier, int node_id) +{ + int device_id; + struct sp_area *spa, *first, *err; + struct rb_node *n; + unsigned long vstart; + unsigned long vend; + unsigned long addr; + unsigned long size_align = ALIGN(size, PMD_SIZE); /* va aligned to 2M */ + struct sp_mapping *mapping; + + device_id = sp_flags_device_id(flags); + if (device_id < 0 || device_id >= MAX_DEVID) { + pr_err("invalid device id %d\n", device_id); + return ERR_PTR(-EINVAL); + } + + if (flags & SP_PROT_FOCUS) { + if ((flags & (SP_DVPP | SP_PROT_RO)) != SP_PROT_RO) { + pr_err("invalid sp_flags [%lx]\n", flags); + return ERR_PTR(-EINVAL); + } + mapping = spg->mapping[SP_MAPPING_RO]; + } else if (flags & SP_DVPP) { + mapping = spg->mapping[SP_MAPPING_DVPP]; + } else { + mapping = spg->mapping[SP_MAPPING_NORMAL]; + } + + if (!mapping) { + pr_err_ratelimited("non DVPP spg, id %d\n", spg->id); + return ERR_PTR(-EINVAL); + } + + vstart = mapping->start[device_id]; + vend = mapping->end[device_id]; + spa = kmalloc(sizeof(struct sp_area), GFP_KERNEL); + if (unlikely(!spa)) + return ERR_PTR(-ENOMEM); + + spin_lock(&mapping->sp_mapping_lock); + + /* + * Invalidate cache if we have more permissive parameters. + * cached_hole_size notes the largest hole noticed _below_ + * the sp_area cached in free_area_cache: if size fits + * into that hole, we want to scan from vstart to reuse + * the hole instead of allocating above free_area_cache. + * Note that sp_area_free may update free_area_cache + * without updating cached_hole_size. + */ + if (!mapping->free_area_cache || size_align < mapping->cached_hole_size || + vstart != mapping->cached_vstart) { + mapping->cached_hole_size = 0; + mapping->free_area_cache = NULL; + } + + /* record if we encounter less permissive parameters */ + mapping->cached_vstart = vstart; + + /* find starting point for our search */ + if (mapping->free_area_cache) { + first = rb_entry(mapping->free_area_cache, struct sp_area, rb_node); + addr = first->va_end; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + } else { + addr = vstart; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + + n = mapping->area_root.rb_node; + first = NULL; + + while (n) { + struct sp_area *tmp; + + tmp = rb_entry(n, struct sp_area, rb_node); + if (tmp->va_end >= addr) { + first = tmp; + if (tmp->va_start <= addr) + break; + n = n->rb_left; + } else + n = n->rb_right; + } + + if (!first) + goto found; + } + + /* from the starting point, traverse areas until a suitable hole is found */ + while (addr + size_align > first->va_start && addr + size_align <= vend) { + if (addr + mapping->cached_hole_size < first->va_start) + mapping->cached_hole_size = first->va_start - addr; + addr = first->va_end; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; + } + +found: + if (addr + size_align > vend) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + + spa->va_start = addr; + spa->va_end = addr + size_align; + spa->real_size = size; + spa->region_vstart = vstart; + spa->flags = flags; + spa->is_hugepage = (flags & SP_HUGEPAGE); + spa->spg = spg; + spa->spm = mapping; + spa->type = type; + spa->kva = 0; /* NULL pointer */ + spa->applier = applier; + spa->preferred_node_id = node_id; + atomic_set(&spa->use_count, 1); + + /* 
the link location could be saved before, to be optimized */ + spm_insert_area(mapping, spa); + mapping->free_area_cache = &spa->rb_node; + + spin_unlock(&mapping->sp_mapping_lock); + sp_group_insert_area(spg, spa); + + return spa; + +error: + spin_unlock(&mapping->sp_mapping_lock); + kfree(spa); + return err; +} + +/* + * Find a spa with key @addr from @spg and increase its use_count. + * The caller should hold spg->rw_lock + */ +static struct sp_area *sp_area_get(struct sp_group *spg, + unsigned long addr) +{ + struct rb_node *n = spg->spa_root.rb_node; + + while (n) { + struct sp_area *spa; + + spa = rb_entry(n, struct sp_area, spg_link); + if (addr < spa->va_start) { + n = n->rb_left; + } else if (addr > spa->va_start) { + n = n->rb_right; + } else { + /* a spa without any user will die soon */ + if (atomic_inc_not_zero(&spa->use_count)) + return spa; + else + return NULL; + } + } + + return NULL; +} + +/* + * Free the VA region starting from addr to the share pool + */ +static void sp_area_free(struct sp_area *spa) +{ + struct sp_mapping *spm = spa->spm; + + spin_lock(&spm->sp_mapping_lock); + if (spm->free_area_cache) { + struct sp_area *cache; + + cache = rb_entry(spm->free_area_cache, struct sp_area, rb_node); + if (spa->va_start <= cache->va_start) { + spm->free_area_cache = rb_prev(&spa->rb_node); + /* + * the new cache node may be changed to another region, + * i.e. from DVPP region to normal region + */ + if (spm->free_area_cache) { + cache = rb_entry(spm->free_area_cache, + struct sp_area, rb_node); + spm->cached_vstart = cache->region_vstart; + } + /* + * We don't try to update cached_hole_size, + * but it won't go very wrong. + */ + } + } + + rb_erase(&spa->rb_node, &spm->area_root); + spin_unlock(&spm->sp_mapping_lock); + RB_CLEAR_NODE(&spa->rb_node); + kfree(spa); +} + +static void sp_area_put_locked(struct sp_area *spa) +{ + if (atomic_dec_and_test(&spa->use_count)) { + if (sp_group_delete_area(spa->spg, spa)) + /* the caller must hold a refcount for spa->spg under spg->rw_lock */ + atomic_dec(&spa->spg->use_count); + sp_area_free(spa); + } +} + +static void sp_area_drop_func(struct work_struct *work) +{ + bool spa_zero; + struct sp_area *spa = container_of(work, struct sp_area, work); + struct sp_group *spg = spa->spg; + + down_write(&spg->rw_lock); + spa_zero = sp_group_delete_area(spg, spa); + up_write(&spg->rw_lock); + sp_area_free(spa); + if (spa_zero) + sp_group_put(spg); +} + +void sp_area_drop(struct vm_area_struct *vma) +{ + struct sp_area *spa = vma->spa; + + if (!(vma->vm_flags & VM_SHARE_POOL)) + return; + + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma(). Before A calls this func, + * B calls sp_free() to free the same spa. So spa maybe NULL when A + * calls this func later. + */ + if (!spa) + return; + + if (atomic_dec_and_test(&spa->use_count)) { + INIT_WORK(&spa->work, sp_area_drop_func); + schedule_work(&spa->work); + } +} + +/* + * The function calls of do_munmap() won't change any non-atomic member + * of struct sp_group. 
Please review the following chain: + * do_munmap -> remove_vma_list -> remove_vma -> sp_area_drop -> + * sp_area_free + */ +static void sp_munmap(struct mm_struct *mm, unsigned long addr, + unsigned long size) +{ + int err; + + down_write(&mm->mmap_lock); + if (unlikely(!mmget_not_zero(mm))) { + up_write(&mm->mmap_lock); + pr_warn("munmap: target mm is exiting\n"); + return; + } + + err = do_munmap(mm, addr, size, NULL); + /* we are not supposed to fail */ + if (err) + pr_err("failed to unmap VA %pK when sp munmap, %d\n", (void *)addr, err); + + up_write(&mm->mmap_lock); + mmput_async(mm); +} + +/* The caller should hold the write lock for spa->spg->rw_lock */ +static void __sp_free(struct sp_area *spa, struct mm_struct *stop) +{ + struct mm_struct *mm; + struct sp_group_node *spg_node = NULL; + + list_for_each_entry(spg_node, &spa->spg->proc_head, proc_node) { + mm = spg_node->master->mm; + if (mm == stop) + break; + sp_munmap(mm, spa->va_start, spa_size(spa)); + } +} + +/* Free the memory of the backing shmem or hugetlbfs */ +static void sp_fallocate(struct sp_area *spa) +{ + int ret; + unsigned long mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; + unsigned long offset = addr_offset(spa); + + ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); + if (ret) + WARN(1, "sp fallocate failed %d\n", ret); +} + +static struct sp_group *sp_group_get_from_idr(int spg_id) +{ + struct sp_group *spg; + + down_read(&sp_global_sem); + spg = idr_find(&sp_group_idr, spg_id); + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + spg = NULL; + up_read(&sp_global_sem); + + return spg; +} + +static int sp_free_inner(unsigned long addr, int spg_id, bool is_sp_free) +{ + int ret = 0; + struct sp_area *spa; + struct sp_group *spg; + struct sp_group_node *spg_node; + const char *str = is_sp_free ? "sp_free" : "unshare_uva"; + + if (!current->mm) + spg = sp_group_get_from_idr(spg_id); + else + spg = sp_group_get_from_mm(current->mm, spg_id, &spg_node); + if (!spg) { + pr_err("%s, get group failed %d\n", str, spg_id); + return -EINVAL; + } + + down_write(&spg->rw_lock); + spa = sp_area_get(spg, addr); + if (!spa) { + pr_debug("%s, invalid input addr %lx\n", str, addr); + ret = -EINVAL; + goto drop_spg; + } + + if ((is_sp_free && spa->type != SPA_TYPE_ALLOC) || + (!is_sp_free && spa->type == SPA_TYPE_ALLOC)) { + ret = -EINVAL; + pr_warn("%s failed, spa_type is not correct\n", str); + goto drop_spa; + } + + if (!current->mm && spa->applier != current->tgid) { + ret = -EPERM; + pr_err("%s, free a spa allocated by other process(%d), current(%d)\n", + str, spa->applier, current->tgid); + goto drop_spa; + } + + __sp_free(spa, NULL); + if (spa->type == SPA_TYPE_ALLOC) + sp_fallocate(spa); + + if (current->mm) + update_mem_usage(spa_size(spa), false, spa->is_hugepage, spg_node, spa->type); + +drop_spa: + sp_area_put_locked(spa); +drop_spg: + up_write(&spg->rw_lock); + sp_group_put(spg); + return ret; +} + +/** + * mg_sp_free() - Free the memory allocated by mg_sp_alloc() or + * mg_sp_alloc_nodemask(). + * + * @addr: the starting VA of the memory. + * @id: Address space identifier, which is used to distinguish the addr. + * + * Return: + * * 0 - success. + * * -EINVAL - the memory can't be found or was not allocated by share pool. + * * -EPERM - the caller has no permision to free the memory. 
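+ *
+ * Example (an illustrative sketch, not part of the original patch; the
+ * 4 KB size and the protection flags are arbitrary and error handling is
+ * kept minimal):
+ *
+ *	int spg_id = mg_sp_group_add_task(tgid, PROT_READ | PROT_WRITE,
+ *					  SPG_ID_AUTO);
+ *	void *addr = mg_sp_alloc(4096, 0, spg_id);
+ *
+ *	if (!IS_ERR(addr))
+ *		mg_sp_free((unsigned long)addr, spg_id);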
+ */ +int mg_sp_free(unsigned long addr, int id) +{ + if (!sp_is_enabled()) + return -EOPNOTSUPP; + + check_interrupt_context(); + + if (current->flags & PF_KTHREAD) + return -EINVAL; + + return sp_free_inner(addr, id, true); +} +EXPORT_SYMBOL_GPL(mg_sp_free); + +/* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_lock). */ +static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate, + unsigned long prot) +{ + unsigned long addr = spa->va_start; + unsigned long size = spa_size(spa); + unsigned long flags = MAP_FIXED_NOREPLACE | MAP_SHARED | MAP_POPULATE | + MAP_SHARE_POOL; + unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY; + unsigned long pgoff = addr_offset(spa) >> PAGE_SHIFT; + struct vm_area_struct *vma; + + if (spa->flags & SP_PROT_RO) + prot &= ~PROT_WRITE; + + atomic_inc(&spa->use_count); + addr = __do_mmap_mm(mm, file, addr, size, prot, flags, vm_flags, pgoff, + populate, NULL); + if (IS_ERR_VALUE(addr)) { + atomic_dec(&spa->use_count); + pr_err("do_mmap fails %ld\n", addr); + return addr; + } + + vma = find_vma(mm, addr); + vma->spa = spa; + + if (prot & PROT_WRITE) + /* clean PTE_RDONLY flags or trigger SMMU event */ + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | + PTE_DIRTY); + else + vm_flags_clear(vma, VM_MAYWRITE); + + return addr; +} + +static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, + int spg_id, struct sp_alloc_context *ac) +{ + int device_id, node_id; + + if (!sp_is_enabled()) + return -EOPNOTSUPP; + + check_interrupt_context(); + + device_id = sp_flags_device_id(sp_flags); + node_id = sp_flags & SP_SPEC_NODE_ID ? sp_flags_node_id(sp_flags) : device_id; + if (!is_online_node_id(node_id)) { + pr_err_ratelimited("invalid numa node id %d\n", node_id); + return -EINVAL; + } + ac->preferred_node_id = node_id; + + if (current->flags & PF_KTHREAD) { + pr_err_ratelimited("allocation failed, task is kthread\n"); + return -EINVAL; + } + + if (unlikely(!size || (size >> PAGE_SHIFT) > totalram_pages())) { + pr_err_ratelimited("allocation failed, invalid size %lu\n", size); + return -EINVAL; + } + + if (spg_id != SPG_ID_DEFAULT && (spg_id < SPG_ID_MIN || spg_id >= SPG_ID_AUTO)) { + pr_err_ratelimited("allocation failed, invalid group id %d\n", spg_id); + return -EINVAL; + } + + if (sp_flags & (~SP_FLAG_MASK)) { + pr_err_ratelimited("allocation failed, invalid flag %lx\n", sp_flags); + return -EINVAL; + } + + if (sp_flags & SP_HUGEPAGE_ONLY) + sp_flags |= SP_HUGEPAGE; + + if (spg_id == SPG_ID_DEFAULT) { + /* + * We should first init the group_master in pass through scene and + * don't free it until we release the mm_struct. + */ + int ret = sp_init_group_master(current, current->mm); + + if (ret) { + pr_err("sp_alloc init local mapping failed %d\n", ret); + return ret; + } + } + + ac->type = SPA_TYPE_ALLOC; + ac->size = size; + ac->sp_flags = sp_flags; + ac->have_mbind = false; + ac->size_aligned = (sp_flags & SP_HUGEPAGE) ? ALIGN(size, PMD_SIZE) : + ALIGN(size, PAGE_SIZE); + + return 0; +} + +static bool sp_alloc_fallback(struct sp_area *spa, struct sp_alloc_context *ac) +{ + /* + * If hugepage allocation fails, this will transfer to normal page + * and try again. 
(only if SP_HUGEPAGE_ONLY is not flagged + */ + if (!(ac->sp_flags & SP_HUGEPAGE) || (ac->sp_flags & SP_HUGEPAGE_ONLY)) + return false; + + ac->size_aligned = ALIGN(ac->size, PAGE_SIZE); + ac->sp_flags &= ~SP_HUGEPAGE; + /* + * The mempolicy for shared memory is located at backend file, which varies + * between normal pages and huge pages. So we should set the mbind policy again + * when we retry using normal pages. + */ + ac->have_mbind = false; + sp_area_put_locked(spa); + return true; +} + +static long sp_mbind(struct mm_struct *mm, unsigned long start, unsigned long len, + nodemask_t *nodemask) +{ + return __do_mbind(start, len, MPOL_BIND, MPOL_F_STATIC_NODES, + nodemask, MPOL_MF_STRICT, mm); +} + +static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa, + unsigned long populate, struct sp_alloc_context *ac) +{ + int ret; + + if (ac && !ac->have_mbind) { + ret = sp_mbind(mm, spa->va_start, spa->real_size, ac->nodemask); + if (ret < 0) { + pr_err("cannot bind the memory range to node[%*pbl], err:%d\n", + nodemask_pr_args(ac->nodemask), ret); + return ret; + } + ac->have_mbind = true; + } + + /* + * We are not ignoring errors, so if we fail to allocate + * physical memory we just return failure, so we won't encounter + * page fault later on, and more importantly sp_make_share_u2k() + * depends on this feature (and MAP_LOCKED) to work correctly. + */ + ret = do_mm_populate(mm, spa->va_start, populate, 0); + if (ret) { + if (unlikely(fatal_signal_pending(current))) + pr_warn("allocation failed, current thread is killed\n"); + else + pr_warn("allocation failed due to mm populate failed(potential no enough memory when -12): %d\n", + ret); + } + + return ret; +} + +static int sp_k2u_populate(struct mm_struct *mm, struct sp_area *spa); + +#define SP_SKIP_ERR 1 +/* + * The caller should increase the refcnt of the spa to prevent that we map + * a dead spa into a mm_struct. 
+ */ +static int sp_map_spa_to_mm(struct mm_struct *mm, struct sp_area *spa, + unsigned long prot, struct sp_alloc_context *ac, + const char *str) +{ + int ret; + unsigned long mmap_addr; + unsigned long populate = 0; + + down_write(&mm->mmap_lock); + if (unlikely(!mmget_not_zero(mm))) { + up_write(&mm->mmap_lock); + pr_warn("sp_map: target mm is exiting\n"); + return SP_SKIP_ERR; + } + + /* when success, mmap_addr == spa->va_start */ + mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); + if (IS_ERR_VALUE(mmap_addr)) { + up_write(&mm->mmap_lock); + mmput_async(mm); + pr_err("%s, sp mmap failed %ld\n", str, mmap_addr); + return (int)mmap_addr; + } + + if (spa->type == SPA_TYPE_ALLOC) { + up_write(&mm->mmap_lock); + ret = sp_alloc_populate(mm, spa, populate, ac); + if (ret) { + down_write(&mm->mmap_lock); + do_munmap(mm, mmap_addr, spa_size(spa), NULL); + up_write(&mm->mmap_lock); + } + } else { + ret = sp_k2u_populate(mm, spa); + if (ret) { + do_munmap(mm, mmap_addr, spa_size(spa), NULL); + pr_info("k2u populate failed, %d\n", ret); + } + up_write(&mm->mmap_lock); + } + mmput_async(mm); + + return ret; +} + +static int sp_alloc_mmap_populate(struct sp_area *spa, struct sp_alloc_context *ac) +{ + int ret = -EINVAL; + int mmap_ret = 0; + struct mm_struct *mm; + struct sp_group_node *spg_node; + + /* create mapping for each process in the group */ + list_for_each_entry(spg_node, &spa->spg->proc_head, proc_node) { + mm = spg_node->master->mm; + mmap_ret = sp_map_spa_to_mm(mm, spa, spg_node->prot, ac, "sp_alloc"); + if (mmap_ret) { + /* + * Goto fallback procedure upon ERR_VALUE, + * but skip the coredump situation, + * because we don't want one misbehaving process to affect others. + */ + if (mmap_ret != SP_SKIP_ERR) + goto unmap; + + continue; + } + ret = mmap_ret; + } + + return ret; + +unmap: + __sp_free(spa, mm); + + /* + * Sometimes do_mm_populate() allocates some memory and then failed to + * allocate more. (e.g. memory use reaches cgroup limit.) + * In this case, it will return enomem, but will not free the + * memory which has already been allocated. + * + * So if sp_map_spa_to_mm fails, always call sp_fallocate() + * to make sure backup physical memory of the shared file is freed. 
+ */ + sp_fallocate(spa); + + return mmap_ret; +} + +static void *__mg_sp_alloc_nodemask(unsigned long size, unsigned long sp_flags, int spg_id, + nodemask_t *nodemask) +{ + int ret = 0; + struct sp_area *spa; + struct sp_group *spg; + nodemask_t __nodemask; + struct sp_alloc_context ac; + struct sp_group_node *spg_node; + + ret = sp_alloc_prepare(size, sp_flags, spg_id, &ac); + if (ret) + return ERR_PTR(ret); + + if (!nodemask) { /* mg_sp_alloc */ + nodes_clear(__nodemask); + node_set(ac.preferred_node_id, __nodemask); + ac.nodemask = &__nodemask; + } else /* mg_sp_alloc_nodemask */ + ac.nodemask = nodemask; + + spg = sp_group_get_from_mm(current->mm, spg_id, &spg_node); + if (!spg) { + pr_err("allocation failed, can't find group(%d)\n", spg_id); + return ERR_PTR(-ENODEV); + } + + down_write(&spg->rw_lock); +try_again: + spa = sp_area_alloc(ac.size_aligned, ac.sp_flags, spg, + ac.type, current->tgid, ac.preferred_node_id); + if (IS_ERR(spa)) { + up_write(&spg->rw_lock); + pr_err("alloc spa failed in allocation(potential no enough virtual memory when -75): %ld\n", + PTR_ERR(spa)); + ret = PTR_ERR(spa); + goto out; + } + + ret = sp_alloc_mmap_populate(spa, &ac); + if (ret == -ENOMEM && sp_alloc_fallback(spa, &ac)) + goto try_again; + + if (!ret) + update_mem_usage(spa_size(spa), true, spa->is_hugepage, spg_node, spa->type); + + sp_area_put_locked(spa); + up_write(&spg->rw_lock); + +out: + sp_group_put(spg); + if (ret) + return ERR_PTR(ret); + else + return (void *)(spa->va_start); +} + +void *mg_sp_alloc_nodemask(unsigned long size, unsigned long sp_flags, int spg_id, + nodemask_t nodemask) +{ + return __mg_sp_alloc_nodemask(size, sp_flags, spg_id, &nodemask); +} +EXPORT_SYMBOL_GPL(mg_sp_alloc_nodemask); + +/** + * mg_sp_alloc() - Allocate shared memory for all the processes in a sp_group. + * @size: the size of memory to allocate. + * @sp_flags: how to allocate the memory. + * @spg_id: the share group that the memory is allocated to. + * + * Use pass through allocation if spg_id == SPG_ID_DEFAULT in multi-group mode. + * + * Return: + * * if succeed, return the starting address of the shared memory. + * * if fail, return the pointer of -errno. + */ +void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) +{ + return __mg_sp_alloc_nodemask(size, sp_flags, spg_id, NULL); +} +EXPORT_SYMBOL_GPL(mg_sp_alloc); + +/** + * is_vmap_hugepage() - Check if a kernel address belongs to vmalloc family. + * @addr: the kernel space address to be checked. + * + * Return: + * * >0 - a vmalloc hugepage addr. + * * =0 - a normal vmalloc addr. + * * -errno - failure. 
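+ *
+ * Example (illustrative only, not part of the original patch; it assumes
+ * the vmalloc_hugepage() helper added by this patch set, whose area is
+ * expected to carry VM_HUGE_PAGES):
+ *
+ *	void *p = vmalloc_hugepage(2 * PMD_SIZE);
+ *	void *q = vmalloc(PAGE_SIZE);
+ *
+ *	is_vmap_hugepage((unsigned long)p);	returns 1
+ *	is_vmap_hugepage((unsigned long)q);	returns 0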
+ */ +static int is_vmap_hugepage(unsigned long addr) +{ + struct vm_struct *area; + + if (unlikely(!addr)) { + pr_err_ratelimited("null vmap addr pointer\n"); + return -EINVAL; + } + + area = find_vm_area((void *)addr); + if (unlikely(!area)) { + pr_debug("can't find vm area(%lx)\n", addr); + return -EINVAL; + } + + if (area->flags & VM_HUGE_PAGES) + return 1; + else + return 0; +} + +static unsigned long __sp_remap_get_pfn(unsigned long kva) +{ + unsigned long pfn = -EINVAL; + + /* sp_make_share_k2u only support vmalloc address */ + if (is_vmalloc_addr((void *)kva)) + pfn = vmalloc_to_pfn((void *)kva); + + return pfn; +} + +static int sp_k2u_populate(struct mm_struct *mm, struct sp_area *spa) +{ + int ret; + struct vm_area_struct *vma; + unsigned long kva = spa->kva; + unsigned long addr, buf, offset; + + /* This should not fail because we hold the mmap_lock sicne mmap */ + vma = find_vma(mm, spa->va_start); + if (is_vm_hugetlb_page(vma)) { + ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); + if (ret) { + pr_debug("remap vmalloc hugepage failed, ret %d, kva is %lx\n", + ret, (unsigned long)kva); + return ret; + } + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + } else { + addr = kva; + offset = 0; + buf = spa->va_start; + do { + ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE, + __pgprot(vma->vm_page_prot.pgprot)); + if (ret) { + pr_err("remap_pfn_range failed %d\n", ret); + return ret; + } + offset += PAGE_SIZE; + buf += PAGE_SIZE; + addr += PAGE_SIZE; + } while (offset < spa_size(spa)); + } + + return 0; +} + +static int sp_k2u_prepare(unsigned long kva, unsigned long size, + unsigned long sp_flags, int spg_id, struct sp_k2u_context *kc) +{ + int is_hugepage, ret; + unsigned int page_size = PAGE_SIZE; + unsigned long kva_aligned, size_aligned; + + check_interrupt_context(); + + if (!sp_is_enabled()) + return -EOPNOTSUPP; + + if (!size) { + pr_err_ratelimited("k2u input size is 0.\n"); + return -EINVAL; + } + + if (sp_flags & ~SP_FLAG_MASK) { + pr_err_ratelimited("k2u sp_flags %lx error\n", sp_flags); + return -EINVAL; + } + sp_flags &= ~SP_HUGEPAGE; + + if (!current->mm) { + pr_err_ratelimited("k2u: kthread is not allowed\n"); + return -EPERM; + } + + is_hugepage = is_vmap_hugepage(kva); + if (is_hugepage > 0) { + sp_flags |= SP_HUGEPAGE; + page_size = PMD_SIZE; + } else if (is_hugepage == 0) { + /* do nothing */ + } else { + pr_err_ratelimited("k2u kva is not vmalloc address\n"); + return is_hugepage; + } + + if (spg_id == SPG_ID_DEFAULT) { + ret = sp_init_group_master(current, current->mm); + if (ret) { + pr_err("k2u_task init local mapping failed %d\n", ret); + return ret; + } + } + + /* aligned down kva is convenient for caller to start with any valid kva */ + kva_aligned = ALIGN_DOWN(kva, page_size); + size_aligned = ALIGN(kva + size, page_size) - kva_aligned; + + kc->kva = kva; + kc->kva_aligned = kva_aligned; + kc->size = size; + kc->size_aligned = size_aligned; + kc->sp_flags = sp_flags; + kc->type = (spg_id == SPG_ID_DEFAULT) ? SPA_TYPE_K2TASK : SPA_TYPE_K2SPG; + + return 0; +} + +/** + * mg_sp_make_share_k2u() - Share kernel memory to current process or an sp_group. + * @kva: the VA of shared kernel memory. + * @size: the size of shared kernel memory. + * @sp_flags: how to allocate the memory. We only support SP_DVPP. + * @tgid: the tgid of the specified process (Not currently in use). + * @spg_id: the share group that the memory is shared to. 
+ * + * Return: the shared target user address to start at + * + * Share kernel memory to current task if spg_id == SPG_ID_NONE + * or SPG_ID_DEFAULT in multi-group mode. + * + * Return: + * * if succeed, return the shared user address to start at. + * * if fail, return the pointer of -errno. + */ +void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int tgid, int spg_id) +{ + int mmap_ret, ret; + struct sp_area *spa; + struct sp_group *spg; + struct sp_k2u_context kc; + struct sp_group_node *spg_node, *ori_node; + + spg_id = (spg_id == SPG_ID_NONE) ? SPG_ID_DEFAULT : spg_id; + ret = sp_k2u_prepare(kva, size, sp_flags, spg_id, &kc); + if (ret) + return ERR_PTR(ret); + + spg = sp_group_get_from_mm(current->mm, spg_id, &ori_node); + if (!spg) { + pr_err("k2u failed, can't find group(%d)\n", spg_id); + return ERR_PTR(-ENODEV); + } + + down_write(&spg->rw_lock); + spa = sp_area_alloc(kc.size_aligned, kc.sp_flags, spg, kc.type, current->tgid, 0); + if (IS_ERR(spa)) { + up_write(&spg->rw_lock); + pr_err("alloc spa failed in k2u_spg (potential no enough virtual memory when -75): %ld\n", + PTR_ERR(spa)); + sp_group_put(spg); + return spa; + } + + ret = -EINVAL; + spa->kva = kc.kva_aligned; + list_for_each_entry(spg_node, &spg->proc_head, proc_node) { + struct mm_struct *mm = spg_node->master->mm; + + mmap_ret = sp_map_spa_to_mm(mm, spa, spg_node->prot, NULL, "k2u"); + if (mmap_ret) { + if (mmap_ret == SP_SKIP_ERR) + continue; + pr_err("remap k2u to spg failed %d\n", mmap_ret); + __sp_free(spa, mm); + ret = mmap_ret; + break; + } + ret = mmap_ret; + } + + if (!ret) + update_mem_usage(spa_size(spa), true, spa->is_hugepage, ori_node, spa->type); + sp_area_put_locked(spa); + up_write(&spg->rw_lock); + sp_group_put(spg); + + return ret ? ERR_PTR(ret) : (void *)(spa->va_start + (kc.kva - kc.kva_aligned)); +} +EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u); + +static int sp_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page *page; + struct sp_walk_data *sp_walk_data = walk->private; + + /* + * There exist a scene in DVPP where the pagetable is huge page but its + * vma doesn't record it, something like THP. + * So we cannot make out whether it is a hugepage map until we access the + * pmd here. If mixed size of pages appear, just return an error. 
+ */ + if (pmd_huge(*pmd)) { + if (!sp_walk_data->is_page_type_set) { + sp_walk_data->is_page_type_set = true; + sp_walk_data->is_hugepage = true; + } else if (!sp_walk_data->is_hugepage) { + return -EFAULT; + } + + /* To skip pte level walk */ + walk->action = ACTION_CONTINUE; + + page = pmd_page(*pmd); + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + + return 0; + } + + if (!sp_walk_data->is_page_type_set) { + sp_walk_data->is_page_type_set = true; + sp_walk_data->is_hugepage = false; + } else if (sp_walk_data->is_hugepage) + return -EFAULT; + + sp_walk_data->pmd = pmd; + + return 0; +} + +static int sp_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page *page; + struct sp_walk_data *sp_walk_data = walk->private; + pmd_t *pmd = sp_walk_data->pmd; + +retry: + if (unlikely(!pte_present(*pte))) { + swp_entry_t entry; + spinlock_t *ptl = pte_lockptr(walk->mm, pmd); + + if (pte_none(*pte)) + goto no_page; + entry = pte_to_swp_entry(*pte); + if (!is_migration_entry(entry)) + goto no_page; + + pte_unmap_unlock(pte, ptl); + migration_entry_wait(walk->mm, pmd, addr); + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + goto retry; + } + + page = pte_page(*pte); + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; + +no_page: + pr_debug("the page of addr %lx unexpectedly not in RAM\n", + (unsigned long)addr); + return -EFAULT; +} + +static int sp_test_walk(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + /* + * FIXME: The devmm driver uses remap_pfn_range() but actually there + * are associated struct pages, so they should use vm_map_pages() or + * similar APIs. Before the driver has been converted to correct APIs + * we use this test_walk() callback so we can treat VM_PFNMAP VMAs as + * normal VMAs. + */ + return 0; +} + +static int sp_pte_hole(unsigned long start, unsigned long end, + int depth, struct mm_walk *walk) +{ + pr_debug("hole [%lx, %lx) appeared unexpectedly\n", + (unsigned long)start, (unsigned long)end); + return -EFAULT; +} + +static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + pte_t pte = huge_ptep_get(ptep); + struct page *page = pte_page(pte); + struct sp_walk_data *sp_walk_data; + + if (unlikely(!pte_present(pte))) { + pr_debug("the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr); + return -EFAULT; + } + + sp_walk_data = walk->private; + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; +} + +/* + * __sp_walk_page_range() - Walk page table with caller specific callbacks. + * @uva: the start VA of user memory. + * @size: the size of user memory. + * @mm: mm struct of the target task. + * @sp_walk_data: a structure of a page pointer array. + * + * the caller must hold mm->mmap_lock + * + * Notes for parameter alignment: + * When size == 0, let it be page_size, so that at least one page is walked. + * + * When size > 0, for convenience, usually the parameters of uva and + * size are not page aligned. There are four different alignment scenarios and + * we must handler all of them correctly. + * + * The basic idea is to align down uva and align up size so all the pages + * in range [uva, uva + size) are walked. However, there are special cases. + * + * Considering a 2M-hugepage addr scenario. Assuming the caller wants to + * traverse range [1001M, 1004.5M), so uva and size is 1001M and 3.5M + * accordingly. 
The aligned-down uva is 1000M and the aligned-up size is 4M. + * The traverse range will be [1000M, 1004M). Obviously, the final page for + * [1004M, 1004.5M) is not covered. + * + * To fix this problem, we need to walk an additional page, size should be + * ALIGN(uva+size) - uva_aligned + */ +static int __sp_walk_page_range(unsigned long uva, unsigned long size, + struct mm_struct *mm, struct sp_walk_data *sp_walk_data) +{ + int ret = 0; + struct vm_area_struct *vma; + unsigned long page_nr; + struct page **pages = NULL; + bool is_hugepage = false; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size = PAGE_SIZE; + struct mm_walk_ops sp_walk = {}; + + /* + * Here we also support non share pool memory in this interface + * because the caller can't distinguish whether a uva is from the + * share pool or not. It is not the best idea to do so, but currently + * it simplifies overall design. + * + * In this situation, the correctness of the parameters is mainly + * guaranteed by the caller. + */ + vma = find_vma(mm, uva); + if (!vma) { + pr_debug("u2k input uva %lx is invalid\n", (unsigned long)uva); + return -EINVAL; + } + if (is_vm_hugetlb_page(vma)) + is_hugepage = true; + + sp_walk.pte_hole = sp_pte_hole; + sp_walk.test_walk = sp_test_walk; + if (is_hugepage) { + sp_walk_data->is_hugepage = true; + sp_walk.hugetlb_entry = sp_hugetlb_entry; + page_size = PMD_SIZE; + } else { + sp_walk_data->is_hugepage = false; + sp_walk.pte_entry = sp_pte_entry; + sp_walk.pmd_entry = sp_pmd_entry; + } + + sp_walk_data->is_page_type_set = false; + sp_walk_data->page_count = 0; + sp_walk_data->page_size = page_size; + uva_aligned = ALIGN_DOWN(uva, page_size); + sp_walk_data->uva_aligned = uva_aligned; + if (size == 0) + size_aligned = page_size; + else + /* special alignment handling */ + size_aligned = ALIGN(uva + size, page_size) - uva_aligned; + + if (uva_aligned + size_aligned < uva_aligned) { + pr_err_ratelimited("overflow happened in walk page range\n"); + return -EINVAL; + } + + page_nr = size_aligned / page_size; + pages = kvmalloc_array(page_nr, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + pr_err_ratelimited("alloc page array failed in walk page range\n"); + return -ENOMEM; + } + sp_walk_data->pages = pages; + + ret = walk_page_range(mm, uva_aligned, uva_aligned + size_aligned, + &sp_walk, sp_walk_data); + if (ret) { + while (sp_walk_data->page_count--) + put_page(pages[sp_walk_data->page_count]); + kvfree(pages); + sp_walk_data->pages = NULL; + } + + if (sp_walk_data->is_hugepage) + sp_walk_data->uva_aligned = ALIGN_DOWN(uva, PMD_SIZE); + + return ret; +} + +static void __sp_walk_page_free(struct sp_walk_data *data) +{ + int i = 0; + struct page *page; + + while (i < data->page_count) { + page = data->pages[i++]; + put_page(page); + } + + kvfree(data->pages); + /* prevent repeated release */ + data->page_count = 0; + data->pages = NULL; +} + +/** + * mg_sp_make_share_u2k() - Share user memory of a specified process to kernel. + * @uva: the VA of shared user memory + * @size: the size of shared user memory + * @tgid: the tgid of the specified process(Not currently in use) + * + * Return: + * * if success, return the starting kernel address of the shared memory. + * * if failed, return the pointer of -errno. 
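+ *
+ * Example (an illustrative sketch, not part of the original patch; "uva"
+ * and "size" are assumed to describe a range previously obtained with
+ * mg_sp_alloc(), and error handling is kept minimal):
+ *
+ *	void *kva = mg_sp_make_share_u2k(uva, size, tgid);
+ *
+ *	if (!IS_ERR(kva)) {
+ *		memset(kva, 0, size);
+ *		mg_sp_unshare((unsigned long)kva, size, SPG_ID_NONE);
+ *	}
+ *
+ * The same mg_sp_unshare() call also releases mappings created by
+ * mg_sp_make_share_k2u().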
+ */ +void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int tgid) +{ + int ret = 0; + struct mm_struct *mm = current->mm; + void *p = ERR_PTR(-ESRCH); + struct sp_walk_data sp_walk_data; + struct vm_struct *area; + + if (!sp_is_enabled()) + return ERR_PTR(-EOPNOTSUPP); + + check_interrupt_context(); + + if (mm == NULL) { + pr_err("u2k: kthread is not allowed\n"); + return ERR_PTR(-EPERM); + } + + down_write(&mm->mmap_lock); + ret = __sp_walk_page_range(uva, size, mm, &sp_walk_data); + if (ret) { + pr_err_ratelimited("walk page range failed %d\n", ret); + up_write(&mm->mmap_lock); + return ERR_PTR(ret); + } + + if (sp_walk_data.is_hugepage) + p = vmap_hugepage(sp_walk_data.pages, sp_walk_data.page_count, + VM_MAP, PAGE_KERNEL); + else + p = vmap(sp_walk_data.pages, sp_walk_data.page_count, VM_MAP, + PAGE_KERNEL); + up_write(&mm->mmap_lock); + + if (!p) { + pr_err("vmap(huge) in u2k failed\n"); + __sp_walk_page_free(&sp_walk_data); + return ERR_PTR(-ENOMEM); + } + + p = p + (uva - sp_walk_data.uva_aligned); + + /* + * kva p may be used later in k2u. Since p comes from uva originally, + * it's reasonable to add flag VM_USERMAP so that p can be remapped + * into userspace again. + */ + area = find_vm_area(p); + area->flags |= VM_USERMAP; + + kvfree(sp_walk_data.pages); + return p; +} +EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k); + +/* No possible concurrent protection, take care when use */ +static int sp_unshare_kva(unsigned long kva, unsigned long size) +{ + unsigned long addr, kva_aligned; + struct page *page; + unsigned long size_aligned; + unsigned long step; + bool is_hugepage = true; + int ret; + + ret = is_vmap_hugepage(kva); + if (ret > 0) { + kva_aligned = ALIGN_DOWN(kva, PMD_SIZE); + size_aligned = ALIGN(kva + size, PMD_SIZE) - kva_aligned; + step = PMD_SIZE; + } else if (ret == 0) { + kva_aligned = ALIGN_DOWN(kva, PAGE_SIZE); + size_aligned = ALIGN(kva + size, PAGE_SIZE) - kva_aligned; + step = PAGE_SIZE; + is_hugepage = false; + } else { + pr_err_ratelimited("check vmap hugepage failed %d\n", ret); + return -EINVAL; + } + + if (kva_aligned + size_aligned < kva_aligned) { + pr_err_ratelimited("overflow happened in unshare kva\n"); + return -EINVAL; + } + + for (addr = kva_aligned; addr < (kva_aligned + size_aligned); addr += step) { + page = vmalloc_to_page((void *)addr); + if (page) + put_page(page); + else + WARN(1, "vmalloc %pK to page/hugepage failed\n", + (void *)addr); + } + + vunmap((void *)kva_aligned); + + return 0; +} + +/** + * mg_sp_unshare() - Unshare the kernel or user memory which shared by calling + * sp_make_share_{k2u,u2k}(). + * @va: the specified virtual address of memory + * @size: the size of unshared memory + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. + * + * Return: 0 for success, -errno on failure. + */ +int mg_sp_unshare(unsigned long va, unsigned long size, int spg_id) +{ + int ret = 0; + + if (!sp_is_enabled()) + return -EOPNOTSUPP; + + check_interrupt_context(); + + if (current->flags & PF_KTHREAD) + return -EINVAL; + + if (va < TASK_SIZE) { + /* All the spa are aligned to 2M. */ + spg_id = (spg_id == SPG_ID_NONE) ? 
SPG_ID_DEFAULT : spg_id; + ret = sp_free_inner(ALIGN_DOWN(va, PMD_SIZE), spg_id, false); + } else if (va >= PAGE_OFFSET) { + /* kernel address */ + ret = sp_unshare_kva(va, size); + } else { + /* regard user and kernel address ranges as bad address */ + pr_debug("unshare addr %lx is not a user or kernel addr\n", (unsigned long)va); + ret = -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(mg_sp_unshare); + +/** + * mg_sp_walk_page_range() - Walk page table with caller specific callbacks. + * @uva: the start VA of user memory. + * @size: the size of user memory. + * @tsk: task struct of the target task. + * @sp_walk_data: a structure of a page pointer array. + * + * Return: 0 for success, -errno on failure. + * + * When return 0, sp_walk_data describing [uva, uva+size) can be used. + * When return -errno, information in sp_walk_data is useless. + */ +int mg_sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + struct mm_struct *mm; + int ret = 0; + + if (!sp_is_enabled()) + return -EOPNOTSUPP; + + check_interrupt_context(); + + if (unlikely(!sp_walk_data)) { + pr_err_ratelimited("null pointer when walk page range\n"); + return -EINVAL; + } + if (!tsk || (tsk->flags & PF_EXITING)) + return -ESRCH; + + get_task_struct(tsk); + mm = get_task_mm(tsk); + if (!mm) { + put_task_struct(tsk); + return -ESRCH; + } + + down_write(&mm->mmap_lock); + ret = __sp_walk_page_range(uva, size, mm, sp_walk_data); + up_write(&mm->mmap_lock); + + mmput(mm); + put_task_struct(tsk); + + return ret; +} +EXPORT_SYMBOL_GPL(mg_sp_walk_page_range); + +/** + * mg_sp_walk_page_free() - Free the sp_walk_data structure. + * @sp_walk_data: a structure of a page pointer array to be freed. + */ +void mg_sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ + if (!sp_is_enabled()) + return; + + check_interrupt_context(); + + if (!sp_walk_data) + return; + + __sp_walk_page_free(sp_walk_data); +} +EXPORT_SYMBOL_GPL(mg_sp_walk_page_free); + +static bool is_sp_dynamic_dvpp_addr(unsigned long addr); +/** + * mg_sp_config_dvpp_range() - User can config the share pool start address + * of each Da-vinci device. + * @start: the value of share pool start + * @size: the value of share pool + * @device_id: the num of Da-vinci device + * @tgid: the tgid of device process + * + * Return true for success. + * Return false if parameter invalid or has been set up. + * This functuon has no concurrent problem. 
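+ *
+ * Example (an illustrative sketch, not part of the original patch; "start"
+ * is a made-up base address that must lie inside the dynamic DVPP region,
+ * and device 0 with the caller's own tgid are arbitrary choices):
+ *
+ *	if (!mg_sp_config_dvpp_range(start, SZ_1G, 0, task_tgid_nr(current)))
+ *		pr_warn("DVPP range rejected or already configured\n");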
+ */ +bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int tgid) +{ + int ret; + bool err = false; + struct task_struct *tsk; + struct mm_struct *mm; + struct sp_group *spg; + struct sp_mapping *spm; + unsigned long default_start; + + if (!sp_is_enabled()) + return false; + + /* NOTE: check the start address */ + if (tgid < 0 || size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE || + device_id < 0 || device_id >= MAX_DEVID || !is_online_node_id(device_id) + || !is_sp_dynamic_dvpp_addr(start) || !is_sp_dynamic_dvpp_addr(start + size - 1)) + return false; + + ret = get_task(tgid, &tsk); + if (ret) + return false; + + mm = get_task_mm(tsk->group_leader); + if (!mm) + goto put_task; + + spg = sp_get_local_group(tsk, mm); + if (IS_ERR(spg)) + goto put_mm; + + spm = spg->mapping[SP_MAPPING_DVPP]; + default_start = MMAP_SHARE_POOL_DVPP_START + device_id * MMAP_SHARE_POOL_16G_SIZE; + /* The dvpp range of each group can be configured only once */ + if (spm->start[device_id] != default_start) + goto put_spg; + + spm->start[device_id] = start; + spm->end[device_id] = start + size; + + err = true; + +put_spg: + sp_group_put(spg); +put_mm: + mmput(mm); +put_task: + put_task_struct(tsk); + + return err; +} +EXPORT_SYMBOL_GPL(mg_sp_config_dvpp_range); + +static bool is_sp_reserve_addr(unsigned long addr) +{ + return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; +} + +/* + * | 16G host | 16G device | ... | | + * ^ + * | + * MMAP_SHARE_POOL_DVPP_BASE + 16G * 64 + * We only check the device regions. + */ +static bool is_sp_dynamic_dvpp_addr(unsigned long addr) +{ + if (addr < MMAP_SHARE_POOL_DYNAMIC_DVPP_BASE || addr >= MMAP_SHARE_POOL_DYNAMIC_DVPP_END) + return false; + + return (addr - MMAP_SHARE_POOL_DYNAMIC_DVPP_BASE) & MMAP_SHARE_POOL_16G_SIZE; +} + +/** + * mg_is_sharepool_addr() - Check if a user memory address belongs to share pool. + * @addr: the userspace address to be checked. + * + * Return true if addr belongs to share pool, or false vice versa. + */ +bool mg_is_sharepool_addr(unsigned long addr) +{ + return sp_is_enabled() && + ((is_sp_reserve_addr(addr) || is_sp_dynamic_dvpp_addr(addr))); +} +EXPORT_SYMBOL_GPL(mg_is_sharepool_addr); + +/*** Statistical and maintenance functions ***/ + +static void get_mm_rss_info(struct mm_struct *mm, unsigned long *anon, + unsigned long *file, unsigned long *shmem, unsigned long *total_rss) +{ + *anon = get_mm_counter(mm, MM_ANONPAGES); + *file = get_mm_counter(mm, MM_FILEPAGES); + *shmem = get_mm_counter(mm, MM_SHMEMPAGES); + *total_rss = *anon + *file + *shmem; +} + +static void get_process_sp_res(struct sp_group_master *master, + long *sp_res_out, long *sp_res_nsize_out) +{ + struct sp_group *spg; + struct sp_group_node *spg_node; + + *sp_res_out = 0; + *sp_res_nsize_out = 0; + + list_for_each_entry(spg_node, &master->group_head, group_node) { + spg = spg_node->spg; + *sp_res_out += meminfo_alloc_sum_byKB(&spg->meminfo); + *sp_res_nsize_out += byte2kb(atomic64_read(&spg->meminfo.alloc_nsize)); + } +} + +/* + * Statistics of RSS has a maximum 64 pages deviation (256KB). + * Please check_sync_rss_stat(). + */ +static void get_process_non_sp_res(unsigned long total_rss, unsigned long shmem, + long sp_res_nsize, long *non_sp_res_out, long *non_sp_shm_out) +{ + long non_sp_res, non_sp_shm; + + non_sp_res = page2kb(total_rss) - sp_res_nsize; + non_sp_res = non_sp_res < 0 ? 0 : non_sp_res; + non_sp_shm = page2kb(shmem) - sp_res_nsize; + non_sp_shm = non_sp_shm < 0 ? 
0 : non_sp_shm; + + *non_sp_res_out = non_sp_res; + *non_sp_shm_out = non_sp_shm; +} + +static void print_process_prot(struct seq_file *seq, unsigned long prot) +{ + if (prot == PROT_READ) + seq_puts(seq, "R"); + else if (prot == (PROT_READ | PROT_WRITE)) + seq_puts(seq, "RW"); + else + seq_puts(seq, "-"); +} + +int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + struct sp_group_master *master; + struct sp_meminfo *meminfo; + struct sp_group_node *spg_node; + unsigned long anon, file, shmem, total_rss; + long sp_res, sp_res_nsize, non_sp_res, non_sp_shm; + + if (!sp_is_enabled()) + return 0; + + mm = get_task_mm(task); + if (!mm) + return 0; + + down_read(&sp_global_sem); + down_read(&mm->mmap_lock); + master = mm->sp_group_master; + if (!master) + goto out; + + get_mm_rss_info(mm, &anon, &file, &shmem, &total_rss); + meminfo = &master->meminfo; + get_process_sp_res(master, &sp_res, &sp_res_nsize); + get_process_non_sp_res(total_rss, shmem, sp_res_nsize, + &non_sp_res, &non_sp_shm); + + seq_puts(m, "Share Pool Aggregate Data of This Process\n\n"); + seq_printf(m, "%-8s %-16s %-9s %-9s %-9s %-10s %-10s %-8s\n", + "PID", "COMM", "SP_ALLOC", "SP_K2U", "SP_RES", "Non-SP_RES", + "Non-SP_Shm", "VIRT"); + seq_printf(m, "%-8d %-16s %-9ld %-9ld %-9ld %-10ld %-10ld %-8ld\n", + master->tgid, master->comm, + meminfo_alloc_sum_byKB(meminfo), + meminfo_k2u_size(meminfo), + sp_res, non_sp_res, non_sp_shm, + page2kb(mm->total_vm)); + + seq_puts(m, "\n\nProcess in Each SP Group\n\n"); + seq_printf(m, "%-8s %-9s %-9s %-9s %-4s\n", + "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES", "PROT"); + + list_for_each_entry(spg_node, &master->group_head, group_node) { + seq_printf(m, "%-8d %-9ld %-9ld %-9ld ", + spg_node->spg->id, + meminfo_alloc_sum_byKB(&spg_node->meminfo), + meminfo_k2u_size(&spg_node->meminfo), + meminfo_alloc_sum_byKB(&spg_node->spg->meminfo)); + print_process_prot(m, spg_node->prot); + seq_putc(m, '\n'); + } + +out: + up_read(&mm->mmap_lock); + up_read(&sp_global_sem); + mmput(mm); + return 0; +} + +static void spa_stat_of_mapping_show(struct seq_file *seq, struct sp_mapping *spm) +{ + struct rb_node *node; + struct sp_area *spa; + + spin_lock(&spm->sp_mapping_lock); + for (node = rb_first(&spm->area_root); node; node = rb_next(node)) { + spa = rb_entry(node, struct sp_area, rb_node); + + seq_printf(seq, "%-10d 0x%-14lx 0x%-14lx %-10ld ", + spa->spg->id, spa->va_start, spa->va_end, byte2kb(spa->real_size)); + + switch (spa->type) { + case SPA_TYPE_ALLOC: + seq_printf(seq, "%-7s ", "ALLOC"); + break; + case SPA_TYPE_K2TASK: + seq_printf(seq, "%-7s ", "TASK"); + break; + case SPA_TYPE_K2SPG: + seq_printf(seq, "%-7s ", "SPG"); + break; + default: + /* usually impossible, perhaps a developer's mistake */ + break; + } + + if (spa->is_hugepage) + seq_printf(seq, "%-5s ", "Y"); + else + seq_printf(seq, "%-5s ", "N"); + + seq_printf(seq, "%-8d ", spa->applier); + seq_printf(seq, "%-8d\n", atomic_read(&spa->use_count)); + + } + spin_unlock(&spm->sp_mapping_lock); +} + +static void spa_ro_stat_show(struct seq_file *seq) +{ + spa_stat_of_mapping_show(seq, sp_mapping_ro); +} + +static void spa_normal_stat_show(struct seq_file *seq) +{ + spa_stat_of_mapping_show(seq, sp_mapping_normal); +} + +static void spa_dvpp_stat_show(struct seq_file *seq) +{ + struct sp_mapping *spm; + + mutex_lock(&spm_list_lock); + list_for_each_entry(spm, &spm_dvpp_list, spm_node) + spa_stat_of_mapping_show(seq, spm); + 
mutex_unlock(&spm_list_lock); +} + + +static void spa_overview_show(struct seq_file *seq) +{ + s64 total_num, alloc_num, k2u_task_num, k2u_spg_num; + s64 total_size, alloc_size, k2u_task_size, k2u_spg_size; + s64 dvpp_size, dvpp_va_size; + + if (!sp_is_enabled()) + return; + + alloc_num = atomic64_read(&spa_stat.alloc_num); + k2u_task_num = atomic64_read(&spa_stat.k2u_task_num); + k2u_spg_num = atomic64_read(&spa_stat.k2u_spg_num); + alloc_size = atomic64_read(&spa_stat.alloc_size); + k2u_task_size = atomic64_read(&spa_stat.k2u_task_size); + k2u_spg_size = atomic64_read(&spa_stat.k2u_spg_size); + dvpp_size = atomic64_read(&spa_stat.dvpp_size); + dvpp_va_size = atomic64_read(&spa_stat.dvpp_va_size); + total_num = alloc_num + k2u_task_num + k2u_spg_num; + total_size = alloc_size + k2u_task_size + k2u_spg_size; + + SEQ_printf(seq, "Spa total num %lld.\n", total_num); + SEQ_printf(seq, "Spa alloc num %lld, k2u(task) num %lld, k2u(spg) num %lld.\n", + alloc_num, k2u_task_num, k2u_spg_num); + SEQ_printf(seq, "Spa total size: %13lld KB\n", byte2kb(total_size)); + SEQ_printf(seq, "Spa alloc size: %13lld KB\n", byte2kb(alloc_size)); + SEQ_printf(seq, "Spa k2u(task) size: %13lld KB\n", byte2kb(k2u_task_size)); + SEQ_printf(seq, "Spa k2u(spg) size: %13lld KB\n", byte2kb(k2u_spg_size)); + SEQ_printf(seq, "Spa dvpp size: %13lld KB\n", byte2kb(dvpp_size)); + SEQ_printf(seq, "Spa dvpp va size: %13lld MB\n", byte2mb(dvpp_va_size)); + SEQ_printf(seq, "\n"); +} + +static int spg_info_show(int id, void *p, void *data) +{ + struct sp_group *spg = p; + struct seq_file *seq = data; + + if (id >= SPG_ID_LOCAL_MIN && id <= SPG_ID_LOCAL_MAX) + return 0; + + SEQ_printf(seq, "Group %6d ", id); + + down_read(&spg->rw_lock); + SEQ_printf(seq, "size: %lld KB, spa num: %d, total alloc: %ld KB, normal alloc: %lld KB, " + "huge alloc: %lld KB\n", + byte2kb(meminfo_total_size(&spg->meminfo)), + atomic_read(&spg->spa_num), + meminfo_alloc_sum_byKB(&spg->meminfo), + byte2kb(atomic64_read(&spg->meminfo.alloc_nsize)), + byte2kb(atomic64_read(&spg->meminfo.alloc_hsize))); + up_read(&spg->rw_lock); + + return 0; +} + +static void spg_overview_show(struct seq_file *seq) +{ + if (!sp_is_enabled()) + return; + + SEQ_printf(seq, "Share pool total size: %lld KB, spa total num: %d.\n", + byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), + atomic_read(&sp_overall_stat.spa_total_num)); + + down_read(&sp_global_sem); + idr_for_each(&sp_group_idr, spg_info_show, seq); + up_read(&sp_global_sem); + + SEQ_printf(seq, "\n"); +} + +static bool should_show_statistics(void) +{ + if (!capable(CAP_SYS_ADMIN)) + return false; + + if (task_active_pid_ns(current) != &init_pid_ns) + return false; + + return true; +} + +static int spa_stat_show(struct seq_file *seq, void *offset) +{ + if (!should_show_statistics()) + return -EPERM; + + spg_overview_show(seq); + spa_overview_show(seq); + /* print the file header */ + seq_printf(seq, "%-10s %-16s %-16s %-10s %-7s %-5s %-8s %-8s\n", + "Group ID", "va_start", "va_end", "Size(KB)", "Type", "Huge", "PID", "Ref"); + spa_ro_stat_show(seq); + spa_normal_stat_show(seq); + spa_dvpp_stat_show(seq); + return 0; +} + +static int proc_usage_by_group(int id, void *p, void *data) +{ + struct sp_group *spg = p; + struct seq_file *seq = data; + struct sp_group_node *spg_node; + struct mm_struct *mm; + struct sp_group_master *master; + int tgid; + unsigned long anon, file, shmem, total_rss; + + down_read(&spg->rw_lock); + list_for_each_entry(spg_node, &spg->proc_head, proc_node) { + master = spg_node->master; + mm 
+
+static int proc_usage_by_group(int id, void *p, void *data)
+{
+	struct sp_group *spg = p;
+	struct seq_file *seq = data;
+	struct sp_group_node *spg_node;
+	struct mm_struct *mm;
+	struct sp_group_master *master;
+	int tgid;
+	unsigned long anon, file, shmem, total_rss;
+
+	down_read(&spg->rw_lock);
+	list_for_each_entry(spg_node, &spg->proc_head, proc_node) {
+		master = spg_node->master;
+		mm = master->mm;
+		tgid = master->tgid;
+
+		get_mm_rss_info(mm, &anon, &file, &shmem, &total_rss);
+
+		seq_printf(seq, "%-8d ", tgid);
+		seq_printf(seq, "%-8d ", id);
+		seq_printf(seq, "%-9ld %-9ld %-9ld %-8ld %-7ld %-7ld ",
+			   meminfo_alloc_sum_byKB(&spg_node->meminfo),
+			   meminfo_k2u_size(&spg_node->meminfo),
+			   meminfo_alloc_sum_byKB(&spg_node->spg->meminfo),
+			   page2kb(mm->total_vm), page2kb(total_rss),
+			   page2kb(shmem));
+		print_process_prot(seq, spg_node->prot);
+		seq_putc(seq, '\n');
+	}
+	up_read(&spg->rw_lock);
+	cond_resched();
+
+	return 0;
+}
+
+static int proc_group_usage_show(struct seq_file *seq, void *offset)
+{
+	if (!should_show_statistics())
+		return -EPERM;
+
+	spg_overview_show(seq);
+	spa_overview_show(seq);
+
+	/* print the file header */
+	seq_printf(seq, "%-8s %-8s %-9s %-9s %-9s %-8s %-7s %-7s %-4s\n",
+		   "PID", "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES",
+		   "VIRT", "RES", "Shm", "PROT");
+
+	down_read(&sp_global_sem);
+	idr_for_each(&sp_group_idr, proc_usage_by_group, seq);
+	up_read(&sp_global_sem);
+
+	return 0;
+}
+
+static int proc_usage_show(struct seq_file *seq, void *offset)
+{
+	struct sp_group_master *master = NULL;
+	unsigned long anon, file, shmem, total_rss;
+	long sp_res, sp_res_nsize, non_sp_res, non_sp_shm;
+	struct sp_meminfo *meminfo;
+
+	if (!should_show_statistics())
+		return -EPERM;
+
+	seq_printf(seq, "%-8s %-16s %-9s %-9s %-9s %-10s %-10s %-8s\n",
+		   "PID", "COMM", "SP_ALLOC", "SP_K2U", "SP_RES", "Non-SP_RES",
+		   "Non-SP_Shm", "VIRT");
+
+	down_read(&sp_global_sem);
+	mutex_lock(&master_list_lock);
+	list_for_each_entry(master, &master_list, list_node) {
+		meminfo = &master->meminfo;
+		get_mm_rss_info(master->mm, &anon, &file, &shmem, &total_rss);
+		get_process_sp_res(master, &sp_res, &sp_res_nsize);
+		get_process_non_sp_res(total_rss, shmem, sp_res_nsize,
+				       &non_sp_res, &non_sp_shm);
+		seq_printf(seq, "%-8d %-16s %-9ld %-9ld %-9ld %-10ld %-10ld %-8ld\n",
+			   master->tgid, master->comm,
+			   meminfo_alloc_sum_byKB(meminfo),
+			   meminfo_k2u_size(meminfo),
+			   sp_res, non_sp_res, non_sp_shm,
+			   page2kb(master->mm->total_vm));
+	}
+	mutex_unlock(&master_list_lock);
+	up_read(&sp_global_sem);
+
+	return 0;
+}
+
+static void __init proc_sharepool_init(void)
+{
+	if (!proc_mkdir("sharepool", NULL))
+		return;
+
+	proc_create_single_data("sharepool/spa_stat", 0400, NULL, spa_stat_show, NULL);
+	proc_create_single_data("sharepool/proc_stat", 0400, NULL, proc_group_usage_show, NULL);
+	proc_create_single_data("sharepool/proc_overview", 0400, NULL, proc_usage_show, NULL);
+}
+
+/*** End of statistical and maintenance functions ***/
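/*
 * Illustrative sketch only (not part of the patch): proc_sharepool_init()
 * relies on proc_create_single_data(), which wraps a plain show() callback so
 * that no full seq_operations boilerplate is needed.  A minimal standalone
 * equivalent for one file could look like this; the "demo" names are
 * hypothetical, while the real files appear under /proc/sharepool/.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_stat_show(struct seq_file *seq, void *offset)
{
	seq_puts(seq, "demo statistics\n");
	return 0;
}

static int __init demo_proc_init(void)
{
	if (!proc_mkdir("demo_pool", NULL))
		return -ENOMEM;

	/* 0400: readable by root only, matching the sharepool files above */
	if (!proc_create_single_data("demo_pool/stat", 0400, NULL,
				     demo_stat_show, NULL))
		return -ENOMEM;
	return 0;
}
module_init(demo_proc_init);
MODULE_LICENSE("GPL");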
+
+void sp_mm_clean(struct mm_struct *mm)
+{
+	struct sp_meminfo *meminfo;
+	long alloc_size, k2u_size;
+	/* lockless visit */
+	struct sp_group_master *master = mm->sp_group_master;
+	struct sp_group_node *spg_node, *tmp;
+	struct sp_group *spg;
+
+	if (!sp_is_enabled() || !master)
+		return;
+
+	/*
+	 * There are two basic scenarios in which a process in the share pool
+	 * exits while its share pool memory usage is not 0.
+	 * 1. Process A called sp_alloc(), but it terminates without calling
+	 *    sp_free(). Then its share pool memory usage is a positive number.
+	 * 2. Process A never called sp_alloc(), and process B in the same spg
+	 *    called sp_alloc() to get an addr u. Then A gets u somehow and
+	 *    calls sp_free(u). Now A's share pool memory usage is a negative
+	 *    number, while B's memory usage will be a positive number.
+	 *
+	 * We print an informational message in both scenarios.
+	 *
+	 * A process that is not in any sp group needs no message, because
+	 * it won't have any unfreed share pool memory.
+	 */
+	meminfo = &master->meminfo;
+	alloc_size = meminfo_alloc_sum(meminfo);
+	k2u_size = atomic64_read(&meminfo->k2u_size);
+	if (alloc_size != 0 || k2u_size != 0)
+		pr_info("process %s(%d) exits. It applied %ld aligned KB, k2u shared %ld aligned KB\n",
+			master->comm, master->tgid,
+			byte2kb(alloc_size), byte2kb(k2u_size));
+
+	down_write(&sp_global_sem);
+	list_for_each_entry_safe(spg_node, tmp, &master->group_head, group_node) {
+		spg = spg_node->spg;
+
+		down_write(&spg->rw_lock);
+
+		list_del(&spg_node->proc_node);
+		spg->proc_num--;
+		list_del(&spg_node->group_node);
+		master->group_num--;
+
+		up_write(&spg->rw_lock);
+
+		mmdrop(mm);
+		sp_group_put_locked(spg);
+		kfree(spg_node);
+	}
+	up_write(&sp_global_sem);
+
+	sp_del_group_master(master);
+
+	kfree(master);
+}
+
+DEFINE_STATIC_KEY_FALSE(share_pool_enabled_key);
+
+static int __init enable_share_pool(char *s)
+{
+	static_branch_enable(&share_pool_enabled_key);
+	pr_info("Ascend enable share pool features via bootargs\n");
+
+	return 1;
+}
+__setup("enable_ascend_share_pool", enable_share_pool);
+
+static int __init share_pool_init(void)
+{
+	if (!sp_is_enabled())
+		return 0;
+
+	sp_mapping_normal = sp_mapping_create(SP_MAPPING_NORMAL);
+	if (!sp_mapping_normal)
+		goto fail;
+	atomic_inc(&sp_mapping_normal->user);
+
+	sp_mapping_ro = sp_mapping_create(SP_MAPPING_RO);
+	if (!sp_mapping_ro)
+		goto free_normal;
+	atomic_inc(&sp_mapping_ro->user);
+
+	proc_sharepool_init();
+
+	return 0;
+
+free_normal:
+	kfree(sp_mapping_normal);
+fail:
+	pr_err("Ascend share pool initialization failed\n");
+	static_branch_disable(&share_pool_enabled_key);
+	return 1;
+}
+late_initcall(share_pool_init);
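/*
 * Illustrative sketch only (not part of the patch): sp_is_enabled() is not
 * defined in this hunk, but given the DEFINE_STATIC_KEY_FALSE/__setup pair
 * above it is presumably a thin wrapper around the static key, so the feature
 * costs a single patched branch when the bootarg is absent.  A minimal
 * equivalent of that pattern, with hypothetical demo_* names:
 */
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/printk.h>

DEFINE_STATIC_KEY_FALSE(demo_feature_key);

static inline bool demo_feature_enabled(void)
{
	/* stays an out-of-line branch until the key is flipped at boot */
	return static_branch_unlikely(&demo_feature_key);
}

static int __init demo_feature_setup(char *s)
{
	static_branch_enable(&demo_feature_key);
	pr_info("demo feature enabled via bootarg\n");
	return 1;	/* non-zero: the option has been consumed */
}
__setup("enable_demo_feature", demo_feature_setup);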
diff --git a/mm/util.c b/mm/util.c
index 6593ad7b84eea0186d97d7985a55f6d8e825053e..cfbd5bec0cf80052cb49466c3252dc434cbdb571 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -428,6 +429,9 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
 	else if (gap > MAX_GAP)
 		gap = MAX_GAP;
 
+	if (sp_is_enabled())
+		return ALIGN_DOWN(MMAP_SHARE_POOL_START - rnd, PAGE_SIZE);
+
 	return PAGE_ALIGN(STACK_TOP - gap - rnd);
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d13d71687d73417c7b29e5383d99944f6e50026..cf6abcb8575166e9f8bf1df85efc1511f4d58f8e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3213,7 +3213,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	size_per_node = size;
 	if (node == NUMA_NO_NODE)
 		size_per_node /= num_online_nodes();
-	if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
+	if (arch_vmap_pmd_supported(prot) && (size_per_node >= PMD_SIZE ||
+			(IS_ENABLED(CONFIG_EXTEND_HUGEPAGE_MAPPING) &&
+			(vm_flags & VM_HUGE_PAGES))))
 		shift = PMD_SHIFT;
 	else
 		shift = arch_vmap_pte_supported_shift(size_per_node);
@@ -3296,7 +3298,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	return area->addr;
 
 fail:
-	if (shift > PAGE_SHIFT) {
+	if (shift > PAGE_SHIFT && !(IS_ENABLED(CONFIG_EXTEND_HUGEPAGE_MAPPING) &&
+			(vm_flags & VM_HUGE_PAGES))) {
 		shift = PAGE_SHIFT;
 		align = real_align;
 		size = real_size;
@@ -4236,6 +4239,204 @@ bool vmalloc_dump_obj(void *object)
 }
 #endif
 
+#ifdef CONFIG_EXTEND_HUGEPAGE_MAPPING
+/**
+ * vmalloc_hugepage - allocate virtually contiguous hugetlb memory
+ * @size: allocation size
+ *
+ * Allocate enough huge pages to cover @size and map them into
+ * contiguous kernel virtual space.
+ *
+ * The allocation size is aligned to PMD_SIZE automatically.
+ */
+void *vmalloc_hugepage(unsigned long size)
+{
+	return __vmalloc_node_range(size, PMD_SIZE, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL,
+			VM_ALLOW_HUGE_VMAP | VM_HUGE_PAGES, NUMA_NO_NODE,
+			__builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_hugepage);
+
+/**
+ * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory
+ * for userspace
+ * @size: allocation size
+ *
+ * Allocate enough huge pages to cover @size and map them into
+ * contiguous kernel virtual space. The resulting memory area
+ * is zeroed so it can be mapped to userspace without leaking data.
+ *
+ * The allocation size is aligned to PMD_SIZE automatically.
+ */
+void *vmalloc_hugepage_user(unsigned long size)
+{
+	return __vmalloc_node_range(size, PMD_SIZE, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+			VM_ALLOW_HUGE_VMAP | VM_USERMAP | VM_HUGE_PAGES, NUMA_NO_NODE,
+			__builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_hugepage_user);
+
+static int vmap_hugepages_range_noflush(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+	unsigned int i, nr = (end - addr) >> page_shift;
+
+	for (i = 0; i < nr; i++) {
+		int err;
+
+		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+				__pa(page_address(pages[i])), prot,
+				page_shift);
+		if (err)
+			return err;
+
+		addr += 1UL << page_shift;
+	}
+
+	return 0;
+}
+
+static int vmap_hugepages_range(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages,
+		unsigned int page_shift)
+{
+	int err;
+
+	err = vmap_hugepages_range_noflush(addr, end, prot, pages, page_shift);
+	flush_cache_vmap(addr, end);
+
+	return err;
+}
+
+/**
+ * vmap_hugepage - map an array of huge pages into virtually contiguous space
+ * @pages: array of huge page pointers (only the head page of each huge page)
+ * @count: number of huge pages to map
+ * @flags: vm_area->flags
+ * @prot: page protection for the mapping
+ *
+ * Maps @count huge pages from @pages into contiguous kernel virtual
+ * space.
+ */
+void *vmap_hugepage(struct page **pages, unsigned int count,
+		unsigned long flags, pgprot_t prot)
+{
+	struct vm_struct *area;
+	unsigned long size;	/* In bytes */
+
+	might_sleep();
+
+	if (count > totalram_pages())
+		return NULL;
+
+	size = (unsigned long)count << PMD_SHIFT;
+	area = __get_vm_area_node(size, PMD_SIZE, PMD_SHIFT, flags | VM_HUGE_PAGES,
+			VMALLOC_START, VMALLOC_END,
+			NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0));
+	if (!area)
+		return NULL;
+
+	if (vmap_hugepages_range((unsigned long)area->addr,
+			(unsigned long)area->addr + size, prot,
+			pages, PMD_SHIFT) < 0) {
+		vunmap(area->addr);
+		return NULL;
+	}
+
+	return area->addr;
+}
+EXPORT_SYMBOL(vmap_hugepage);
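/*
 * Illustrative sketch only (not part of the patch): a typical consumer of this
 * API allocates a zeroed huge-page buffer with vmalloc_hugepage_user() and
 * hands it to userspace from its ->mmap() handler via
 * remap_vmalloc_hugepage_range(), which is introduced just below.  The demo_*
 * names are hypothetical, error handling and locking are trimmed, device
 * registration is omitted, and the prototypes are assumed to be exposed
 * through <linux/vmalloc.h> elsewhere in this series.
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/pgtable.h>
#include <linux/vmalloc.h>

#define DEMO_BUF_SIZE	(4 * PMD_SIZE)	/* hypothetical buffer size, PMD multiple */

static void *demo_buf;	/* shared buffer, allocated at open time */

static int demo_open(struct inode *inode, struct file *file)
{
	demo_buf = vmalloc_hugepage_user(DEMO_BUF_SIZE);
	return demo_buf ? 0 : -ENOMEM;
}

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma->vm_end - vma->vm_start > DEMO_BUF_SIZE)
		return -EINVAL;

	/*
	 * Map from the start of the buffer; note that the pgoff argument
	 * counts huge pages, not base pages, and vma->vm_start must be
	 * PMD aligned or the remap helper returns -EINVAL.
	 */
	return remap_vmalloc_hugepage_range(vma, demo_buf, 0);
}

static int demo_release(struct inode *inode, struct file *file)
{
	vfree(demo_buf);	/* vfree() also handles huge vmalloc areas */
	demo_buf = NULL;
	return 0;
}

static const struct file_operations demo_fops = {
	.owner		= THIS_MODULE,
	.open		= demo_open,
	.mmap		= demo_mmap,
	.release	= demo_release,
};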
+
+/**
+ * remap_vmalloc_hugepage_range_partial - map vmalloc hugepages
+ * to userspace
+ * @vma: vma to cover
+ * @uaddr: target user address to start at
+ * @kaddr: virtual address of vmalloc hugepage kernel memory
+ * @pgoff: number of hugepages into @kaddr before the first page to map
+ * @size: size of map area
+ *
+ * Returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that @kaddr is a valid vmalloc'ed area,
+ * and that it is big enough to cover the range starting at
+ * @uaddr in @vma. Will return failure if that criteria isn't
+ * met.
+ *
+ * Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
+		void *kaddr, unsigned long pgoff, unsigned long size)
+{
+	struct vm_struct *area;
+	unsigned long off;
+	unsigned long end_index;
+
+	if (check_shl_overflow(pgoff, PMD_SHIFT, &off))
+		return -EINVAL;
+
+	size = ALIGN(size, PMD_SIZE);
+
+	if (!IS_ALIGNED(uaddr, PMD_SIZE) || !IS_ALIGNED((unsigned long)kaddr, PMD_SIZE))
+		return -EINVAL;
+
+	area = find_vm_area(kaddr);
+	if (!area)
+		return -EINVAL;
+
+	if (!(area->flags & VM_USERMAP))
+		return -EINVAL;
+
+	if (check_add_overflow(size, off, &end_index) ||
+	    end_index > get_vm_area_size(area))
+		return -EINVAL;
+	kaddr += off;
+
+	do {
+		struct page *page = vmalloc_to_page(kaddr);
+		int ret;
+
+		ret = hugetlb_insert_hugepage_pte_by_pa(vma->vm_mm, uaddr,
+				vma->vm_page_prot, page_to_phys(page));
+		if (ret)
+			return ret;
+
+		uaddr += PMD_SIZE;
+		kaddr += PMD_SIZE;
+		size -= PMD_SIZE;
+	} while (size > 0);
+
+	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
+
+	return 0;
+}
+EXPORT_SYMBOL(remap_vmalloc_hugepage_range_partial);
+
+/**
+ * remap_vmalloc_hugepage_range - map vmalloc hugepages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of hugepages into addr before first page to map
+ *
+ * Returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
+ *
+ * Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, void *addr,
+		unsigned long pgoff)
+{
+	return remap_vmalloc_hugepage_range_partial(vma, vma->vm_start,
+			addr, pgoff,
+			vma->vm_end - vma->vm_start);
+}
+EXPORT_SYMBOL(remap_vmalloc_hugepage_range);
+#endif
+
 #ifdef CONFIG_PROC_FS
 static void *s_start(struct seq_file *m, loff_t *pos)
 	__acquires(&vmap_purge_lock)