diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8c87a2e0b660c1dbfcf74cbd455ec5728962bd3a..0421a7a368a28012a2b2bab86e5f92102e2baf3b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -18,6 +18,8 @@ void __flush_tlb_all(void); #define TLB_FLUSH_ALL -1UL void cr4_update_irqsoff(unsigned long set, unsigned long clear); +bool cr_pinning_enabled(void); + unsigned long cr4_read_shadow(void); /* Set in this cpu's CR4. */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b58b5ba4a70f9382e579852b6982b4a89bb9be5f..8deb567c92a6d58ba337a9d2dc1419e0d6bea3d4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -442,8 +442,15 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); + static unsigned long cr4_pinned_bits __ro_after_init; +bool cr_pinning_enabled(void) +{ + return static_key_enabled(&cr_pinning); +} +EXPORT_SYMBOL(cr_pinning_enabled); + void native_write_cr0(unsigned long val) { unsigned long bits_missing = 0; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6a432e59615923f5557a6d67757461db6dd6d8ec..d765d199de1e09bda4e4d2b5dfa28027ce9d73b4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1215,6 +1215,13 @@ void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + /* + * The altmap range was offlined successfully and its vmemmap + * has been replaced with memory allocated from the buddy allocator. + */ + if (altmap && !altmap->alloc) + altmap = NULL; + __remove_pages(start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 67eef614d7925349eeee8a73fb2376ceaadad062..a82db349a3a8e9ce7a7b247176cdcf888774f253 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -593,6 +593,21 @@ struct memory_block *find_memory_block(struct mem_section *section) return find_memory_block_by_id(block_id); } +unsigned long get_memory_block_vmemmap_pages(unsigned long block_id) +{ + struct memory_block *mem = find_memory_block_by_id(block_id); + unsigned long nr_vmemmap_size; + + if (!mem || !mem->nr_vmemmap_pages) + return 0; + + nr_vmemmap_size = mem->nr_vmemmap_pages << PAGE_SHIFT; + put_device(&mem->dev); + + return nr_vmemmap_size; +} +EXPORT_SYMBOL(get_memory_block_vmemmap_pages); + static struct attribute *memory_memblk_attrs[] = { &dev_attr_phys_index.attr, &dev_attr_state.attr, diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index d2a7da2831d9f43d704ca4affde7800a25172dac..00f6b6301ffbc367c800bb1b8065b9a73cec46c0 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -856,6 +856,7 @@ static int virtblk_probe(struct virtio_device *vdev) u32 v, blk_size, max_size, sg_elems, opt_io_size; u16 min_io_size; u8 physical_block_exp, alignment_offset; + unsigned int queue_depth; if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", @@ -907,16 +908,18 @@ static int virtblk_probe(struct virtio_device *vdev) } /* Default queue sizing is to fill the ring. */ - if (!virtblk_queue_depth) { - virtblk_queue_depth = vblk->vqs[0].vq->num_free; + if (likely(!virtblk_queue_depth)) { + queue_depth = vblk->vqs[0].vq->num_free; /* ...
but without indirect descs, we use 2 descs per req */ if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) - virtblk_queue_depth /= 2; + queue_depth /= 2; + } else { + queue_depth = virtblk_queue_depth; } memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); vblk->tag_set.ops = &virtio_mq_ops; - vblk->tag_set.queue_depth = virtblk_queue_depth; + vblk->tag_set.queue_depth = queue_depth; vblk->tag_set.numa_node = NUMA_NO_NODE; vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; vblk->tag_set.cmd_size = diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 94c2b556cf9728ea66973f97b0d0222c97b36da5..c2953f7924cc0bd5dfb3cc7ad261d50b890825ab 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -94,6 +94,14 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) } #endif +static bool devmem_readonly; +static int __init devmem_enable_readonly(char *val) +{ + devmem_readonly = true; + return 0; +} +early_param("devmem.enable_readonly", devmem_enable_readonly); + #ifndef unxlate_dev_mem_ptr #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr void __weak unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) @@ -213,6 +221,9 @@ static ssize_t write_mem(struct file *file, const char __user *buf, if (!valid_phys_addr_range(p, count)) return -EFAULT; + if (devmem_readonly) + return -EPERM; + written = 0; #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED @@ -375,6 +386,11 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma) size_t size = vma->vm_end - vma->vm_start; phys_addr_t offset = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; + if (devmem_readonly) { + vma->vm_flags &= ~(VM_WRITE | VM_SHARED); + vma_set_page_prot(vma); + } + /* Does it even fit in phys_addr_t? */ if (offset >> PAGE_SHIFT != vma->vm_pgoff) return -EINVAL; diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index e8532ab519f59664d77213fee4dbf0508618bf6e..4bb1a077e54a6233d0f379d5d21981dec1ee7e35 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -130,7 +130,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) * this as RAM automatically. */ rc = add_memory_driver_managed(data->mgid, range.start, - range_len(&range), kmem_name, MHP_NID_IS_MGID); + range_len(&range), kmem_name, MHP_NID_IS_MGID, 0); if (rc) { dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index e0b0633c5c812d449ac04487f2a37352618f27d5..e0ae99cf39fb5cc2c19f85a6d1eff0619cc0d031 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -613,6 +613,7 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, uint64_t size) { int rc; + mhp_t extra_flags = 0; /* * When force-unloading the driver and we still have memory added to @@ -629,8 +630,13 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, addr + size - 1); /* Memory might get onlined immediately. 
*/ atomic64_add(size, &vm->offline_size); +#ifdef CONFIG_X86_64 + /* memmap_on_memory is only supported in the SBM scenario */ + if (vm->in_sbm) + extra_flags |= MHP_MEMMAP_ON_MEMORY; +#endif rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, - MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); + MHP_MERGE_RESOURCE | MHP_NID_IS_MGID, extra_flags); if (rc) { atomic64_sub(size, &vm->offline_size); dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); @@ -870,13 +876,13 @@ static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, } static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, - unsigned long mb_id) + unsigned long mb_id, unsigned long nr_sb) { const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; int sb_id; - for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { + for (sb_id = nr_sb; sb_id < vm->sbm.sbs_per_mb; sb_id++) { if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + @@ -886,13 +892,13 @@ static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, } static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, - unsigned long mb_id) + unsigned long mb_id, unsigned long nr_sb) { const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; int sb_id; - for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { + for (sb_id = nr_sb; sb_id < vm->sbm.sbs_per_mb; sb_id++) { if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + @@ -942,19 +948,21 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, const unsigned long size = PFN_PHYS(mhp->nr_pages); int rc = NOTIFY_OK; unsigned long id; + unsigned long nr_vmemmap_size = 0; if (!virtio_mem_overlaps_range(vm, start, size)) return NOTIFY_DONE; if (vm->in_sbm) { id = virtio_mem_phys_to_mb_id(start); + nr_vmemmap_size = get_memory_block_vmemmap_pages(id); + /* - * In SBM, we add memory in separate memory blocks - we expect - * it to be onlined/offlined in the same granularity. Bail out - * if this ever changes. + * In SBM, we add memory in separate memory blocks, but vmemmap pages + * may be placed at the start of a memory block; we still expect the + * whole memory block to be onlined/offlined in that case. */ - if (WARN_ON_ONCE(size != memory_block_size_bytes() || - !IS_ALIGNED(start, memory_block_size_bytes()))) + if (WARN_ON_ONCE(!IS_ALIGNED(start, vm->sbm.sb_size))) return NOTIFY_BAD; } else { id = virtio_mem_phys_to_bb_id(vm, start); @@ -986,7 +994,8 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, } vm->hotplug_active = true; if (vm->in_sbm) - virtio_mem_sbm_notify_going_offline(vm, id); + virtio_mem_sbm_notify_going_offline(vm, id, + nr_vmemmap_size / vm->sbm.sb_size); else virtio_mem_bbm_notify_going_offline(vm, id, mhp->start_pfn, @@ -1007,7 +1016,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, if (vm->in_sbm) virtio_mem_sbm_notify_offline(vm, id); - atomic64_add(size, &vm->offline_size); + atomic64_add(size + nr_vmemmap_size, &vm->offline_size); /* * Trigger the workqueue. Now that we have some offline memory, * maybe we can handle pending unplug requests.
@@ -1022,7 +1031,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, if (vm->in_sbm) virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); - atomic64_sub(size, &vm->offline_size); + atomic64_sub(size + nr_vmemmap_size, &vm->offline_size); /* * Start adding more memory once we onlined half of our * threshold. Don't trigger if it's possibly due to our actipn @@ -1040,7 +1049,8 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, if (!vm->hotplug_active) break; if (vm->in_sbm) - virtio_mem_sbm_notify_cancel_offline(vm, id); + virtio_mem_sbm_notify_cancel_offline(vm, id, + nr_vmemmap_size / vm->sbm.sb_size); else virtio_mem_bbm_notify_cancel_offline(vm, id, mhp->start_pfn, @@ -1141,12 +1151,18 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) * Try to allocate a range, marking pages fake-offline, effectively * fake-offlining them. */ -static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) +static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages, bool map) { - const bool is_movable = page_zonenum(pfn_to_page(pfn)) == - ZONE_MOVABLE; int rc, retry_count; + /* + * "map" means this subblock backs the struct pages (vmemmap) of the + * movable zone; the vmemmap range will be remapped to newly allocated + * pages to preserve the page information needed by offline_pages(). + */ + if (map) + return 0; + /* * TODO: We want an alloc_contig_range() mode that tries to allocate * harder (e.g., dealing with temporarily pinned pages, PCP), especially @@ -1160,7 +1176,7 @@ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) if (rc == -ENOMEM) /* whoops, out of memory */ return rc; - else if (rc && !is_movable) + else if (rc && page_zonenum(pfn_to_page(pfn)) != ZONE_MOVABLE) break; else if (rc) continue; @@ -1892,22 +1908,46 @@ static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, * * Will modify the state of the memory block.
*/ +static void virtio_mem_init_section(unsigned long start_pfn, unsigned long nr_pages) +{ + struct mem_section *ms; + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct page *memmap = pfn_to_page(start_pfn); + + ms = __nr_to_section(section_nr); + ms->section_mem_map = sparse_encode_mem_map(memmap, section_nr) + | SECTION_MARKED_PRESENT | SECTION_IS_ONLINE | SECTION_HAS_MEM_MAP; +} + static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, unsigned long mb_id, int sb_id, - int count) + int count, bool map) { const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); unsigned long start_pfn; + unsigned long block_addr = virtio_mem_mb_id_to_phys(mb_id); + unsigned long block_nr_pages = PFN_DOWN(memory_block_size_bytes()); + unsigned long block_start_pfn = PFN_DOWN(block_addr); + unsigned long block_start = (unsigned long)pfn_to_page(block_start_pfn); + unsigned long block_end = block_start + block_nr_pages * sizeof(struct page); + unsigned long altmap_pfn; int rc; + LIST_HEAD(vmemmap_pages); - start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->sbm.sb_size); + start_pfn = PFN_DOWN(block_addr + sb_id * vm->sbm.sb_size); - rc = virtio_mem_fake_offline(start_pfn, nr_pages); + rc = virtio_mem_fake_offline(start_pfn, nr_pages, map); if (rc) return rc; + if (map) { + /* Make sure the memory block's page mapping gets recorded */ + if (vmemmap_remap_alloc(block_start, block_end, + VIRTIO_MEMMAP_COPY, GFP_KERNEL, &vmemmap_pages)) + return -ENOMEM; + } + /* Try to unplug the allocated memory */ rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); if (rc) { @@ -1916,6 +1956,17 @@ static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, return rc; } + if (map) { + /* Make sure the memory block's page mapping has been rebuilt */ + if (vmemmap_remap_alloc(block_start, block_end, + VIRTIO_MEMMAP_RESTORE, GFP_KERNEL, &vmemmap_pages)) + return -ENOMEM; + + for (altmap_pfn = block_start_pfn; altmap_pfn < block_start_pfn + nr_pages; + altmap_pfn += PAGES_PER_SECTION) + virtio_mem_init_section(altmap_pfn, PAGES_PER_SECTION); + } + switch (old_state) { case VIRTIO_MEM_SBM_MB_KERNEL: virtio_mem_sbm_set_mb_state(vm, mb_id, @@ -1945,12 +1996,16 @@ static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, uint64_t *nb_sb) { int rc, sb_id; + bool map = false; + unsigned long nr_vmemmap_size = get_memory_block_vmemmap_pages(mb_id); + unsigned long nr_vmemmap_sbs = nr_vmemmap_size / vm->sbm.sb_size; + int count_vmemmap = 0; /* If possible, try to unplug the complete block in one shot. */ if (*nb_sb >= vm->sbm.sbs_per_mb && virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, - vm->sbm.sbs_per_mb); + vm->sbm.sbs_per_mb, map); if (!rc) { *nb_sb -= vm->sbm.sbs_per_mb; goto unplugged; @@ -1967,7 +2022,15 @@ static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, if (sb_id < 0) break; - rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); + if (nr_vmemmap_size && sb_id < nr_vmemmap_sbs && + virtio_mem_sbm_test_sb_unplugged(vm, mb_id, nr_vmemmap_sbs, + vm->sbm.sbs_per_mb - nr_vmemmap_sbs)) { + map = true; + count_vmemmap++; + continue; + } + + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1, map); if (rc == -EBUSY) continue; else if (rc) @@ -1975,6 +2038,12 @@ static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, *nb_sb -= 1; } + /* Unplug the vmemmap of the whole memory block, if it exists.
*/ + if (map) { + virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id + 1, count_vmemmap, map); + *nb_sb -= count_vmemmap; + } + unplugged: /* * Once all subblocks of a memory block were unplugged, offline and @@ -2114,7 +2183,7 @@ static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, if (!page) continue; - rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); + rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION, false); if (rc) { end_pfn = pfn; goto rollback_safe_unplug; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 8db96b5e725360463f7105461e18f05898421ec8..c79072df4f87664ec581364a7c11154f3f4f4498 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -336,7 +336,7 @@ static enum bp_state reserve_additional_memory(void) mutex_unlock(&balloon_mutex); /* add_memory_resource() requires the device_hotplug lock */ lock_device_hotplug(); - rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE); + rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE, 0); unlock_device_hotplug(); mutex_lock(&balloon_mutex); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 72c8cf8662f59d6b37b3e9e8eec12aa64a2f8fa7..9d637b1cabe5559d1b0e5f1a835c76a32b34ce46 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -97,6 +97,18 @@ enum transparent_hugepage_flag { #endif }; +#ifdef CONFIG_HUGETEXT +#define BIT_SHIFT_THP_TEXT_FILE 0 +#define BIT_SHIFT_THP_TEXT_ANON 1 +#define BIT_SHIFT_THP_TEXT_FILE_DIRECT 2 + +#define BIT_MASK_THP_TEXT_FILE (1 << BIT_SHIFT_THP_TEXT_FILE) +#define BIT_MASK_THP_TEXT_ANON (1 << BIT_SHIFT_THP_TEXT_ANON) +#define BIT_MASK_THP_TEXT_FILE_DIRECT (1 << BIT_SHIFT_THP_TEXT_FILE_DIRECT) +#define BIT_MASK_THP_TEXT_ALL (BIT_MASK_THP_TEXT_FILE | \ + BIT_MASK_THP_TEXT_ANON | BIT_MASK_THP_TEXT_FILE_DIRECT) +#endif + struct kobject; struct kobj_attribute; diff --git a/include/linux/memory.h b/include/linux/memory.h index cbcc43ad2b97721b7f6aca6f63e6693a1adc710f..4ddc3b960ae95f266fedcfa249949552695d6bac 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -136,6 +136,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block(struct mem_section *); +extern unsigned long get_memory_block_vmemmap_pages(unsigned long block_id); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 657e0511ed4eea10be143ec8b734fd64e1568440..874abfbf215fbeb517e33ee74a49c9ec5c62986a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -197,6 +197,7 @@ extern u64 max_mem_size; extern int memhp_online_type_from_str(const char *str); +extern bool skip_set_contiguous; /* Default online_type (MMOP_*) when new memory blocks are added. 
*/ extern int memhp_default_online_type; /* If movable_node boot option specified */ @@ -331,10 +332,10 @@ extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory_resource(int nid, struct resource *resource, - mhp_t mhp_flags); + mhp_t mhp_flags, mhp_t extra_flags); extern int add_memory_driver_managed(int nid, u64 start, u64 size, const char *resource_name, - mhp_t mhp_flags); + mhp_t mhp_flags, mhp_t extra_flags); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap, int migratetype); diff --git a/include/linux/mm.h b/include/linux/mm.h index e09f994c61b0c1ac1c3677e42a16d0fe6c0f347b..5531b4543dbc63e7fcf61437339b86c51c64b32a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1789,6 +1789,43 @@ extern int mlock_fixup(struct vm_area_struct *vma, */ #define ZAP_ZEROPAGE 0x01 +/* + * virtio-mem uses the following special values when remapping the memmap. + */ +#define VIRTIO_MEMMAP_RESTORE (-1UL) +#define VIRTIO_MEMMAP_COPY (-2UL) +#ifdef CONFIG_VIRTIO_MEM +static inline bool virtio_is_use_memmap(unsigned long addr) +{ + return addr == VIRTIO_MEMMAP_RESTORE || addr == VIRTIO_MEMMAP_COPY; +} + +static inline bool virtio_memmap_restore(unsigned long addr) +{ + return addr == VIRTIO_MEMMAP_RESTORE; +} + +static inline bool virtio_memmap_copy(unsigned long addr) +{ + return addr == VIRTIO_MEMMAP_COPY; +} +#else +static inline bool virtio_is_use_memmap(unsigned long addr) +{ + return false; +} + +static inline bool virtio_memmap_restore(unsigned long addr) +{ + return false; +} + +static inline bool virtio_memmap_copy(unsigned long addr) +{ + return false; +} +#endif + /* * Parameter block passed down to zap_pte_range in exceptional cases.
*/ @@ -3207,12 +3244,19 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, gfp_t gfp_mask); + unsigned long reuse, gfp_t gfp_mask, struct list_head *pages); +#else +static inline int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, + gfp_t gfp_mask, struct list_head *pages) +{ + return -EINVAL; +} #endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap); +unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum); void pmd_init(void *addr); void pud_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); diff --git a/include/linux/oom.h b/include/linux/oom.h index 4623f66ceb318f35b631e6a3db8f4108c95557fc..e52bc37491545678d13b2954a68d0338697cfb8c 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -118,6 +118,7 @@ long oom_badness(struct task_struct *p, extern bool out_of_memory(struct oom_control *oc); extern void exit_oom_victim(void); +extern void mem_cgroup_oom_notify(struct mem_cgroup *memcg); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8111b975b751538561268bb9f7bebe4bd8557f21..c1ea2c7deaaf20dc495e1298579db83527dc8d12 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -342,11 +342,11 @@ static ssize_t hugetext_enabled_show(struct kobject *kobj, int val = 0; if (test_bit(TRANSPARENT_HUGEPAGE_FILE_TEXT_ENABLED_FLAG, &transparent_hugepage_flags)) - val |= 0x01; + val |= BIT_MASK_THP_TEXT_FILE; if (test_bit(TRANSPARENT_HUGEPAGE_ANON_TEXT_ENABLED_FLAG, &transparent_hugepage_flags)) - val |= 0x02; + val |= BIT_MASK_THP_TEXT_ANON; if (test_bit(TRANSPARENT_HUGEPAGE_FILE_TEXT_DIRECT_FLAG, &transparent_hugepage_flags)) - val |= 0x04; + val |= BIT_MASK_THP_TEXT_FILE_DIRECT; return sprintf(buf, "%d\n", val); } @@ -361,29 +361,29 @@ static ssize_t hugetext_enabled_store(struct kobject *kobj, return -EINVAL; ret = kstrtoul(buf, 0, &val); - if (ret < 0 || val > 7) + if (ret < 0 || val > BIT_MASK_THP_TEXT_ALL) return -EINVAL; /* FILE_TEXT_DIRECT depends on FILE_TEXT_ENABLED */ - if ((val & 0x4) && !(val & 0x1)) + if ((val & BIT_MASK_THP_TEXT_FILE_DIRECT) && !(val & BIT_MASK_THP_TEXT_FILE)) return -EINVAL; ret = count; - if (val & 0x01) + if (val & BIT_MASK_THP_TEXT_FILE) set_bit(TRANSPARENT_HUGEPAGE_FILE_TEXT_ENABLED_FLAG, &transparent_hugepage_flags); else clear_bit(TRANSPARENT_HUGEPAGE_FILE_TEXT_ENABLED_FLAG, &transparent_hugepage_flags); - if (val & 0x02) + if (val & BIT_MASK_THP_TEXT_ANON) set_bit(TRANSPARENT_HUGEPAGE_ANON_TEXT_ENABLED_FLAG, &transparent_hugepage_flags); else clear_bit(TRANSPARENT_HUGEPAGE_ANON_TEXT_ENABLED_FLAG, &transparent_hugepage_flags); - if (val & 0x04) + if (val & BIT_MASK_THP_TEXT_FILE_DIRECT) set_bit(TRANSPARENT_HUGEPAGE_FILE_TEXT_DIRECT_FLAG, &transparent_hugepage_flags); else @@ -753,30 +753,30 @@ static int __init setup_hugetext(char *str) goto out; err = kstrtoul(str, 0, &val); - if (err < 0 || val > 7) + if (err < 0 || val > BIT_MASK_THP_TEXT_ALL) goto out; /* FILE_TEXT_DIRECT depends on FILE_TEXT_ENABLED */ - if ((val & 0x4) && !(val & 0x1)) { + if ((val & BIT_MASK_THP_TEXT_FILE_DIRECT) && !(val & BIT_MASK_THP_TEXT_FILE)) { err = -EINVAL; goto 
out; } - if (val & 0x01) + if (val & BIT_MASK_THP_TEXT_FILE) set_bit(TRANSPARENT_HUGEPAGE_FILE_TEXT_ENABLED_FLAG, &transparent_hugepage_flags); else clear_bit(TRANSPARENT_HUGEPAGE_FILE_TEXT_ENABLED_FLAG, &transparent_hugepage_flags); - if (val & 0x02) + if (val & BIT_MASK_THP_TEXT_ANON) set_bit(TRANSPARENT_HUGEPAGE_ANON_TEXT_ENABLED_FLAG, &transparent_hugepage_flags); else clear_bit(TRANSPARENT_HUGEPAGE_ANON_TEXT_ENABLED_FLAG, &transparent_hugepage_flags); - if (val & 0x04) + if (val & BIT_MASK_THP_TEXT_FILE_DIRECT) set_bit(TRANSPARENT_HUGEPAGE_FILE_TEXT_DIRECT_FLAG, &transparent_hugepage_flags); else diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9167fcf8ed638ccaa424d2aaae43569a1005157d..caf8a1bef280ba1a2989a889dd57ee5e58006ded 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -254,7 +254,7 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, - GFP_KERNEL | __GFP_NORETRY | __GFP_NOKFENCE); + GFP_KERNEL | __GFP_NORETRY | __GFP_NOKFENCE, NULL); if (!ret) { ClearHPageVmemmapOptimized(head); static_branch_dec(&hugetlb_optimize_vmemmap_key); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fd92b8ebfb1c4801ea5dbb04fa4a86dcd6700cde..285b2f64484719ab33c67ff6f16ccb7db11e1019 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -187,7 +187,6 @@ struct mem_cgroup_event { }; static void mem_cgroup_threshold(struct mem_cgroup *memcg); -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value); static int memory_min_show(struct seq_file *m, void *v); @@ -5683,7 +5682,7 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) return 0; } -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) +void mem_cgroup_oom_notify(struct mem_cgroup *memcg) { struct mem_cgroup *iter; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9f2bc27d7f18103efbaec6ddaff5467740e50316..990d14e1be4b56d68c045e31667e97140a717f76 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -118,6 +118,10 @@ MODULE_PARM_DESC(auto_movable_ratio, "Set the maximum ratio of MOVABLE:KERNEL memory in the system " "in percent for \"auto-movable\" online policy. 
Default: 301"); +bool skip_set_contiguous __read_mostly; +module_param(skip_set_contiguous, bool, 0644); +MODULE_PARM_DESC(skip_set_contiguous, "Do not set zone contiguous when online/offline pages"); + /* * memory_hotplug.auto_movable_numa_aware: consider numa node stats */ @@ -1299,7 +1303,8 @@ bool mhp_supports_memmap_on_memory(unsigned long size) * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) +int __ref add_memory_resource(int nid, struct resource *res, + mhp_t mhp_flags, mhp_t extra_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; struct vmem_altmap mhp_altmap = {}; @@ -1340,7 +1345,9 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* * Self hosted memmap array */ - if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { + if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) || + ((extra_flags & MHP_MEMMAP_ON_MEMORY) && + mhp_memmap_on_memory())) { if (!mhp_supports_memmap_on_memory(size)) { ret = -EINVAL; goto error; @@ -1414,7 +1421,7 @@ int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res, mhp_flags); + ret = add_memory_resource(nid, res, mhp_flags, 0); if (ret < 0) release_memory_resource(res); return ret; @@ -1454,7 +1461,8 @@ EXPORT_SYMBOL_GPL(add_memory); * "System RAM ($DRIVER)". */ int add_memory_driver_managed(int nid, u64 start, u64 size, - const char *resource_name, mhp_t mhp_flags) + const char *resource_name, + mhp_t mhp_flags, mhp_t extra_flags) { struct resource *res; int rc; @@ -1472,7 +1480,7 @@ int add_memory_driver_managed(int nid, u64 start, u64 size, goto out_unlock; } - rc = add_memory_resource(nid, res, mhp_flags); + rc = add_memory_resource(nid, res, mhp_flags, extra_flags); if (rc < 0) release_memory_resource(res); @@ -1494,6 +1502,7 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, struct zone *zone = NULL; struct page *page; int i; + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); pfn < end_pfn; pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { @@ -1879,7 +1888,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* Mark all sections offline and remove free pages from the buddy. */ __offline_isolated_pages(start_pfn, end_pfn); - pr_info("Offlined Pages %ld\n", nr_pages); + pr_debug("Offlined Pages %ld\n", nr_pages); /* * The memory sections are marked offline, and the pageblock flags @@ -2167,8 +2176,10 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg) * Default is MMOP_OFFLINE - change it only if offlining succeeded, * so try_reonline_memory_block() can do the right thing. */ - if (!rc) + if (!rc) { **online_types = online_type; + mem->nr_vmemmap_pages = 0; + } (*online_types)++; /* Ignore if already offline. 
*/ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8de061ffaaf4ed0d265371bcd9ad362afba86790..d63abf27fd22158cb0d5aacf1da096d6e8a192fd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1021,6 +1021,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message) struct mem_cgroup *oom_group; static DEFINE_RATELIMIT_STATE(oom_global_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg; +#endif /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -1042,6 +1045,16 @@ static void oom_kill_process(struct oom_control *oc, const char *message) else if (!is_memcg_oom(oc) && __ratelimit(&oom_global_rs)) dump_global_header(oc, victim); +#ifdef CONFIG_MEMCG + rcu_read_lock(); + memcg = mem_cgroup_from_task(victim); + if (memcg != NULL && memcg != root_mem_cgroup && !is_memcg_oom(oc)) { + css_get(&memcg->css); + mem_cgroup_oom_notify(memcg); + css_put(&memcg->css); + } + rcu_read_unlock(); +#endif /* * Do we need to kill the entire memory cgroup? * Or even one of the ancestor memory cgroups? diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f4233ca7252e6606dfa93c0e8c94add40a006922..86e1fe5de0b5eaabf081724107ef56c00048a9c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1895,6 +1895,11 @@ void set_zone_contiguous(struct zone *zone) unsigned long block_start_pfn = zone->zone_start_pfn; unsigned long block_end_pfn; +#ifdef CONFIG_MEMORY_HOTPLUG + if (skip_set_contiguous) + return; +#endif + block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); for (; block_start_pfn < zone_end_pfn(zone); block_start_pfn = block_end_pfn, @@ -9320,7 +9325,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, 0)) { - pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", + pr_debug_ratelimited("%s: [%lx, %lx) PFNs busy\n", __func__, outer_start, end); ret = -EBUSY; goto done; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index adfddcc93a37519a57b2c004c31c37ac2114fc0a..8ea9e10f802608e8533cc271a3505c84958416d5 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -59,8 +59,8 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) pmd_t __pmd; int i; unsigned long addr = start; - struct page *page = pmd_page(*pmd); pte_t *pgtable = pte_alloc_one_kernel(&init_mm); + unsigned long pfn = pmd_pfn(*pmd); if (!pgtable) return -ENOMEM; @@ -71,7 +71,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; - entry = mk_pte(page + i, pgprot); + entry = pfn_pte(pfn + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } @@ -114,7 +114,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, * The reuse_page is found 'first' in table walk before we start * remapping (which is calling @walk->remap_pte). 
*/ - if (!walk->reuse_page) { + if (!virtio_is_use_memmap(walk->reuse_addr) && !walk->reuse_page) { walk->reuse_page = pte_page(*pte); /* * Because the reuse address is part of the range that we are @@ -288,22 +288,44 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, { pgprot_t pgprot = PAGE_KERNEL; struct page *page; - void *to; + void *to, *src; + unsigned long pfn; + pte_t entry; - BUG_ON(pte_page(*pte) != walk->reuse_page); + if (!virtio_is_use_memmap(walk->reuse_addr)) + BUG_ON(pte_page(*pte) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); to = page_to_virt(page); - copy_page(to, (void *)walk->reuse_addr); - reset_struct_pages(to); + if (!virtio_is_use_memmap(walk->reuse_addr)) { + copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); + entry = mk_pte(page, pgprot); + } else { + if (virtio_memmap_restore(walk->reuse_addr)) { + pfn = __pa(to) >> PAGE_SHIFT; + entry = pfn_pte(pfn, pgprot); + } else { + if (virtio_memmap_copy(walk->reuse_addr)) { + pfn = pte_pfn(*pte); + src = __va(__pfn_to_phys(pfn)); + + copy_page(to, src); + list_add_tail(&page->lru, walk->vmemmap_pages); + return; + } + /* warn if the walk ever reaches this point */ + WARN_ON(1); + } + } /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); - set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); + set_pte_at(&init_mm, addr, pte, entry); } /** @@ -404,7 +426,7 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, * Return: %0 on success, negative error code otherwise. */ int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, gfp_t gfp_mask) + unsigned long reuse, gfp_t gfp_mask, struct list_head *altmap_pages) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { @@ -414,17 +436,26 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, }; /* See the comment in the vmemmap_remap_free(). */ - BUG_ON(start - reuse != PAGE_SIZE); + if (!virtio_is_use_memmap(reuse)) + BUG_ON(start - reuse != PAGE_SIZE); + + if (virtio_is_use_memmap(reuse)) + walk.vmemmap_pages = altmap_pages; - if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + if (list_empty(walk.vmemmap_pages) && alloc_vmemmap_page_list(start, + end, gfp_mask, walk.vmemmap_pages)) return -ENOMEM; mmap_read_lock(&init_mm); - vmemmap_remap_range(reuse, end, &walk); + if (virtio_is_use_memmap(reuse)) + vmemmap_remap_range(start, end, &walk); + else + vmemmap_remap_range(reuse, end, &walk); mmap_read_unlock(&init_mm); return 0; } +EXPORT_SYMBOL(vmemmap_remap_alloc); #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* diff --git a/mm/sparse.c b/mm/sparse.c index 1fa994760f4c2e4f93f611ce86a3fb1f6384a520..fc13230c1921bb132273333573b1e8c260d73a7e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -304,7 +304,7 @@ static void __init memblocks_present(void) * the identity pfn - section_mem_map will return the actual * physical page frame number.
*/ -static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) +unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) { unsigned long coded_mem_map = (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); @@ -312,6 +312,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); return coded_mem_map; } +EXPORT_SYMBOL(sparse_encode_mem_map); #ifdef CONFIG_MEMORY_HOTPLUG /* diff --git a/net/hookers/hookers.c b/net/hookers/hookers.c index dbdae64985dcf14f43f9caa6578a36c295380b30..6e87d48c8c36d5638202103f2b999220ab68a2eb 100644 --- a/net/hookers/hookers.c +++ b/net/hookers/hookers.c @@ -263,12 +263,10 @@ EXPORT_SYMBOL_GPL(hooker_uninstall); #if defined(CONFIG_X86) static inline unsigned int hookers_clear_cr0(void) { - struct static_key *orig_key; unsigned int cr0 = read_cr0(); unsigned long val = cr0 & 0xfffeffff; - orig_key = (struct static_key *)kallsyms_lookup_name("cr_pinning"); - if (!orig_key || !static_key_enabled(orig_key)) + if (!cr_pinning_enabled()) write_cr0(val); else asm volatile("mov %0,%%cr0" : "+r" (val) : : "memory"); diff --git a/net/vtoa/Makefile b/net/vtoa/Makefile index 57d59fac30158d74ec43b35d21673f45e9acb3ae..7da0d9920b17e1bb7139484118a0de9508f7ab8a 100644 --- a/net/vtoa/Makefile +++ b/net/vtoa/Makefile @@ -1,2 +1,2 @@ -obj-m = vtoa.o +obj-$(CONFIG_VTOA) = vtoa.o vtoa-objs := vtoa_main.o vtoa_ctl.o
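
For reference, below is a minimal usage sketch of the interfaces changed above: the extra_flags argument added to add_memory_driver_managed() and the newly exported get_memory_block_vmemmap_pages() helper. The caller, its name, and the flag combination are hypothetical illustrations and not part of this patch; only the two patched interfaces are assumed.

#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/printk.h>

/* Hypothetical caller, for illustration only (assumes a 64-bit build). */
static int example_add_range(int nid, u64 start, u64 size)
{
	unsigned long block_id;
	int rc;

	/* Request a self-hosted memmap through the new extra_flags argument. */
	rc = add_memory_driver_managed(nid, start, size,
				       "System RAM (example)",
				       MHP_MERGE_RESOURCE,
				       MHP_MEMMAP_ON_MEMORY);
	if (rc)
		return rc;

	/* Despite its name, the helper returns the vmemmap size in bytes. */
	block_id = start / memory_block_size_bytes();
	pr_info("vmemmap size of block %lu: %lu bytes\n", block_id,
		get_memory_block_vmemmap_pages(block_id));
	return 0;
}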