diff --git a/arch/x86/include/asm/csv.h b/arch/x86/include/asm/csv.h index fb0818fbd9c71d8c0f8f97b1a23055a4d6989923..1b22ff627a6f06b6a2299afab20b97aaa8ef745c 100644 --- a/arch/x86/include/asm/csv.h +++ b/arch/x86/include/asm/csv.h @@ -10,6 +10,8 @@ #ifndef _ASM_X86_CSV_H #define _ASM_X86_CSV_H +#include <linux/numa.h> + #include "csv_command.h" #ifdef CONFIG_HYGON_CSV @@ -27,7 +29,7 @@ extern unsigned int csv_smr_num; extern atomic_long_t csv3_npt_size; extern atomic_long_t csv3_pri_mem; extern unsigned long csv3_meta; -extern atomic_long_t *csv3_shared_mem; +extern atomic_long_t csv3_shared_mem[MAX_NUMNODES]; #endif /* CONFIG_SYSFS */ void __init early_csv_reserve_mem(void); diff --git a/arch/x86/kvm/svm/csv.c b/arch/x86/kvm/svm/csv.c index 00bacb4528cd5e1606916699bb34fcf08d754f8e..bcb0afb32775585f5d685893c99f247483d3f7d0 100644 --- a/arch/x86/kvm/svm/csv.c +++ b/arch/x86/kvm/svm/csv.c @@ -24,6 +24,8 @@ #include "csv.h" #include "x86.h" +#include "trace.h" + #undef pr_fmt #define pr_fmt(fmt) "CSV: " fmt @@ -73,18 +75,35 @@ enum csv_pg_level { /* * Manage shared page in rbtree, the node within the rbtree - * is indexed by gfn. @page points to the page mapped by @gfn + * is indexed by track_hva. @track_page points to the page mapped by @track_hva * in NPT. */ struct shared_page { struct rb_node node; - gfn_t gfn; - struct page *page; + /* The hva corresponding to @track_page. */ + u64 track_hva; + /* Page order of @track_page. */ + unsigned int order; + /* + * This field is NULL when @order equals 0, but must point to a + * valid bitmap when @order > 0 (i.e., for huge pages and THP). + */ + unsigned long *bitmap; + /* + * Pointer to the head page of the page block. + * Always points to the head page, whether @order is 0 (a single 4K + * page) or greater (a huge page or THP). + */ + struct page *track_page; }; struct shared_page_mgr { + /* The root of the manager tree */ struct rb_root root; + /* The count of shared_page entries in the manager tree */ u64 count; + /* The total number of pages tracked by the manager */ + unsigned long nr_pages; }; struct kvm_csv_info { @@ -125,7 +144,40 @@ struct secure_memory_region { static struct kvm_x86_ops csv_x86_ops; -static bool shared_page_insert(struct shared_page_mgr *mgr, struct shared_page *sp) +#ifdef CONFIG_SYSFS +static void update_csv_share_mem(struct page *page, bool add) +{ + int nid; + + nid = page_to_nid(page); + if (add) + atomic_long_add(page_size(page), &csv3_shared_mem[nid]); + else + atomic_long_sub(page_size(page), &csv3_shared_mem[nid]); +} +#else +static void update_csv_share_mem(struct page *page, bool add) { }; +#endif /* CONFIG_SYSFS */ + +/** + * insert_shared_page_entry_locked - Insert a shared_page into the manager tree + * @mgr: Pointer to the shared page manager (must be valid). + * @new_sp: The shared_page to insert. + * @old_sp: Pointer to store the conflicting entry if insertion fails (can be NULL). + * + * Attempts to insert @new_sp into the rbtree of @mgr. If an entry with the + * same key (track_hva) already exists, the insertion is aborted and + * the existing entry is returned via @old_sp. + * + * The caller must hold the appropriate lock protecting @mgr. + * + * Return: + * %true if @new_sp was successfully inserted; + * %false if a duplicate exists (in which case @old_sp is set if non-NULL).
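+ *
+ * A minimal usage sketch (illustrative only; it mirrors the callers added
+ * later in this patch and assumes the caller holds csv->sp_lock and has
+ * already pinned @page with pin_user_pages()):
+ *
+ *	struct shared_page *sp, *old_sp;
+ *
+ *	sp = alloc_shared_page_entry(csv, hva, page);
+ *	if (sp && !insert_shared_page_entry_locked(&csv->sp_mgr, sp, &old_sp)) {
+ *		unpin_user_page(page);
+ *		free_shared_page_entry(csv, sp);
+ *		shared_page_entry_set_bit_locked(&csv->sp_mgr, hva, old_sp);
+ *	}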
+ */ +static bool insert_shared_page_entry_locked(struct shared_page_mgr *mgr, + struct shared_page *new_sp, + struct shared_page **old_sp) { struct shared_page *sp_iter; struct rb_root *root; @@ -140,57 +192,283 @@ static bool shared_page_insert(struct shared_page_mgr *mgr, struct shared_page * sp_iter = rb_entry(*new, struct shared_page, node); parent = *new; - if (sp->gfn < sp_iter->gfn) + if (new_sp->track_hva < sp_iter->track_hva) { new = &((*new)->rb_left); - else if (sp->gfn > sp_iter->gfn) + } else if (new_sp->track_hva > sp_iter->track_hva) { new = &((*new)->rb_right); - else + } else { + trace_kvm_csv3_sp_insert_dup(page_to_pfn(new_sp->track_page), + new_sp->track_hva, + new_sp->order, + page_to_pfn(sp_iter->track_page), + sp_iter->track_hva, + sp_iter->order); + + /* + * If found the same @track_page during insertion, the + * @track_page must be pinned more than once. + */ + if (new_sp->track_page == sp_iter->track_page && + ((hpage_pincount_available(sp_iter->track_page) && + compound_pincount(sp_iter->track_page) < 2) || + (!hpage_pincount_available(sp_iter->track_page) && + ((unsigned int)page_ref_count(sp_iter->track_page) + < GUP_PIN_COUNTING_BIAS * 2)))) + pr_err_ratelimited("%s: SP_MGR_ERR: pfn:0x%lx" + " order:%d pincount < 2\n", + __func__, + page_to_pfn(sp_iter->track_page), + sp_iter->order); + + if (old_sp) + *old_sp = sp_iter; return false; + } } + trace_kvm_csv3_sp_insert(page_to_pfn(new_sp->track_page), + new_sp->track_hva, + new_sp->order); + /* Add new node and rebalance tree. */ - rb_link_node(&sp->node, parent, new); - rb_insert_color(&sp->node, root); + rb_link_node(&new_sp->node, parent, new); + rb_insert_color(&new_sp->node, root); + + /* Update shared page statistics */ mgr->count++; + mgr->nr_pages += 1UL << new_sp->order; + update_csv_share_mem(new_sp->track_page, true); return true; } -static struct shared_page *shared_page_search(struct shared_page_mgr *mgr, gfn_t gfn) +/** + * search_shared_page_entry_locked - Search for a shared page covering @hva + * @mgr: Pointer to the shared page manager. + * @hva: Host virtual address to search for. + * + * Searches the rbtree of @mgr for a shared page that maps @hva. The search + * iterates through supported page orders (e.g., 4K, 2M, 1G) in ascending order + * (starting from order 0). + * + * The caller must hold the lock protecting @mgr. + * + * Return: Pointer to matching shared_page, or NULL if not found. + */ +static +struct shared_page *search_shared_page_entry_locked(struct shared_page_mgr *mgr, + u64 hva) { struct shared_page *sp; struct rb_root *root; struct rb_node *node; + u64 track_hva; + unsigned int order = 0; + +again: + track_hva = (hva & ~((1ULL << (order + PAGE_SHIFT)) - 1)); root = &mgr->root; node = root->rb_node; while (node) { sp = rb_entry(node, struct shared_page, node); - if (gfn < sp->gfn) + if (track_hva < sp->track_hva) node = node->rb_left; - else if (gfn > sp->gfn) + else if (track_hva > sp->track_hva) node = node->rb_right; else - return sp; + return (track_hva == hva || sp->order == order) + ? sp : NULL; + } + + if (order == 0) { + order = PMD_SHIFT - PAGE_SHIFT; + goto again; + } else if (order == (PMD_SHIFT - PAGE_SHIFT)) { + order = PUD_SHIFT - PAGE_SHIFT; + goto again; } return NULL; +} + +/** + * shared_page_entry_set_bit_locked - Set bitmap in a shared page entry if found + * @mgr: Pointer to the shared page manager (caller must hold its lock). + * @hva: Host virtual address identifying the 4K subpage of a compound page. 
+ * @sp: Optional hint pointer to the shared_page entry; if NULL, the entry is + * looked up in @mgr's rbtree using @hva. + * + * If @sp is NULL, the function searches for the entry in the manager's tree. + * + * For compound pages (@order > 0), a bitmap tracks which 4K subpages are + * shared memory for the CSV3 VM. The bit corresponding to the offset of @hva within + * the compound page is set atomically. + * + * On every successful lookup (including repeated hits), a kvm_csv3_sp_hit + * trace event is emitted with the PFN and order. + * + * Return: + * Pointer to the shared_page entry if found, or NULL if no entry covers @hva. + */ +static +struct shared_page *shared_page_entry_set_bit_locked(struct shared_page_mgr *mgr, + u64 hva, + struct shared_page *sp) +{ + /* If @sp is NULL, look the entry up in the manager tree. */ + if (!sp) + sp = search_shared_page_entry_locked(mgr, hva); + if (sp) { + if (sp->order) { + unsigned int pg_off = (hva & ~sp->track_hva) >> PAGE_SHIFT; + + set_bit(pg_off, sp->bitmap); + } + + trace_kvm_csv3_sp_hit(page_to_pfn(sp->track_page), + sp->track_hva, hva, sp->order); + } + + return sp; } -static struct shared_page *shared_page_remove(struct shared_page_mgr *mgr, gfn_t gfn) +/** + * remove_shared_page_entry_locked - Remove a shared_page entry once none of its + * subpages remain shared memory of the CSV3 VM. + * @mgr: Pointer to the shared page manager (lock must be held by caller). + * @hva: Host virtual address identifying the subpage. + * + * This function attempts to remove a shared_page entry from the manager when it + * no longer contains any of the CSV3 VM's shared memory. The entry is only removed if: + * - None of the 4K subpages within the compound page is still the VM's shared memory + * - The underlying physical page is not pinned by any other user + * + * If the page is found to be pinned more than once, the removal is aborted + * because the page is still used by other users. + * + * Context: Caller must hold the lock protecting @mgr. + * + * Return: + * Pointer to the removed shared_page entry on success, or NULL if no entry is + * found, some bits are still set in @bitmap, or the page is pinned more than once. + */ +static +struct shared_page *remove_shared_page_entry_locked(struct shared_page_mgr *mgr, + u64 hva) { struct shared_page *sp; - sp = shared_page_search(mgr, gfn); + sp = search_shared_page_entry_locked(mgr, hva); if (sp) { + /* + * The bitmap records the 4K subpages within the compound page + * that are shared memory of the VM. Clear the bit for this @hva. + */ + if (sp->order) { + unsigned int pg_off = (hva & ~sp->track_hva) >> PAGE_SHIFT; + + clear_bit(pg_off, sp->bitmap); + if (!bitmap_empty(sp->bitmap, 1U << sp->order)) + return NULL; + } + + /* + * The @sp->track_page may be pinned more than once in some + * scenarios, such as device passthrough. We don't remove this + * entry from the tree. + */ + if ((hpage_pincount_available(sp->track_page) && + compound_pincount(sp->track_page) >= 2) || + (!hpage_pincount_available(sp->track_page) && + ((unsigned int)page_ref_count(sp->track_page) >= GUP_PIN_COUNTING_BIAS * 2))) { + return NULL; + } + + trace_kvm_csv3_sp_remove(page_to_pfn(sp->track_page), + sp->track_hva, sp->order); + rb_erase(&sp->node, &mgr->root); mgr->count--; + mgr->nr_pages -= 1UL << sp->order; + update_csv_share_mem(sp->track_page, false); } return sp; } +/** + * alloc_shared_page_entry - Allocate and initialize a shared_page entry for a + * given page + * @csv: Pointer to the KVM CSV info structure containing the slab cache.
+ * @hva: Host virtual address associated with the page (used for alignment and + * tracking). + * @page: The physical page being tracked (may be a base or compound/huge page). + * + * Allocates a new struct shared_page from the slab cache in @csv, initializes + * it to represent the memory region covered by @page at virtual address @hva. + * If @page is a compound page (order > 0), a bitmap is allocated to track + * touched 4K subpages, and the bit corresponding to the 4K offset of @hva + * within the compound page is set. For order-0 pages, no bitmap is allocated. + * The @track_hva field is aligned to the start of the page block (i.e., + * compound page boundary if applicable). + * + * Return: + * Pointer to the newly allocated and initialized shared_page on success, + * or NULL on allocation failure (either slab or bitmap). + */ +struct shared_page *alloc_shared_page_entry(struct kvm_csv_info *csv, + u64 hva, + struct page *page) +{ + struct shared_page *sp; + unsigned int order; + unsigned int pg_off; + + sp = kmem_cache_zalloc(csv->sp_slab, GFP_KERNEL); + if (!sp) + return NULL; + + order = compound_order(compound_head(page)); + if (order) { + unsigned long *bitmap = kvzalloc(BITS_TO_LONGS(1U << order) + * sizeof(unsigned long), + GFP_KERNEL); + if (!bitmap) { + kmem_cache_free(csv->sp_slab, sp); + return NULL; + } + sp->bitmap = bitmap; + pg_off = (hva >> PAGE_SHIFT) & ((1U << order) - 1); + set_bit(pg_off, sp->bitmap); + } + + sp->track_hva = hva & ~((1ULL << (order + PAGE_SHIFT)) - 1); + sp->order = order; + sp->track_page = compound_head(page); + + return sp; +} + +/** + * free_shared_page_entry - Free a shared_page entry and its associated + * resources + * @csv: Pointer to the KVM CSV info structure containing the slab cache. + * @sp: The shared_page entry to free. + * + * The caller must ensure that @sp is no longer in use (e.g., already removed + * from any rbtree) before calling this function. 
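+ *
+ * A minimal teardown sketch (illustrative only; it mirrors the release path
+ * below with the pin-count checks omitted, and assumes the caller holds
+ * csv->sp_lock):
+ *
+ *	sp = remove_shared_page_entry_locked(&csv->sp_mgr, hva);
+ *	if (sp) {
+ *		unpin_user_page(sp->track_page);
+ *		free_shared_page_entry(csv, sp);
+ *	}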
+ */ +void free_shared_page_entry(struct kvm_csv_info *csv, struct shared_page *sp) +{ + if (!sp) + return; + + kvfree(sp->bitmap); + kmem_cache_free(csv->sp_slab, sp); +} + static inline struct kvm_svm_csv *to_kvm_svm_csv(struct kvm *kvm) { return (struct kvm_svm_csv *)container_of(kvm, struct kvm_svm, kvm); @@ -1510,34 +1788,17 @@ static int __csv_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, return r; } -#ifdef CONFIG_SYSFS -static void update_csv_share_mem(struct page *page, bool add) +static int csv_pin_shared_memory_locked(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *slot, + gfn_t gfn, + kvm_pfn_t *pfn) { - int nid; - struct page *h_page; - - h_page = compound_head(page); - nid = page_to_nid(page); - if (add) - atomic_long_add(page_size(h_page), &csv3_shared_mem[nid]); - else - atomic_long_sub(page_size(h_page), &csv3_shared_mem[nid]); -} -#else -static void update_csv_share_mem(struct page *page, bool add) { }; -#endif /* CONFIG_SYSFS */ - -static int csv_pin_shared_memory(struct kvm_vcpu *vcpu, - struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t *pfn) -{ - struct page *page; u64 hva; - int npinned; kvm_pfn_t tmp_pfn; struct kvm *kvm = vcpu->kvm; struct kvm_csv_info *csv = &to_kvm_svm_csv(kvm)->csv_info; struct shared_page *sp; + unsigned int pg_off; bool write = !(slot->flags & KVM_MEM_READONLY); bool is_dma_pinned; @@ -1552,38 +1813,74 @@ static int csv_pin_shared_memory(struct kvm_vcpu *vcpu, return 0; } + hva = __gfn_to_hva_memslot(slot, gfn); + is_dma_pinned = page_maybe_dma_pinned(pfn_to_page(tmp_pfn)); kvm_release_pfn_clean(tmp_pfn); if (is_dma_pinned) { + /* + * If we have already pinned the page when traversing the + * memslot, and the pinned page is part of a compound page that + * is tracked by a shared_page entry, we must set the corresponding + * bit in the @bitmap to reflect that this subpage is shared + * memory for the CSV3 VM. + * + * This ensures that all 4K subpages within a compound page can + * be accurately tracked, preventing premature removal of pages + * that are still shared memory. + */ + shared_page_entry_set_bit_locked(&csv->sp_mgr, hva, NULL); *pfn = tmp_pfn; return 0; } - sp = shared_page_search(&csv->sp_mgr, gfn); - if (!sp) { - sp = kmem_cache_zalloc(csv->sp_slab, GFP_KERNEL); - if (!sp) - return -ENOMEM; - - hva = __gfn_to_hva_memslot(slot, gfn); + /* + * If the shared page manager does not already track @hva: + * - Pin the page using pin_user_pages(); + * - Allocate a new shared_page entry; if the pinned page is part of + * a compound page, allocate a @bitmap and set the bit corresponding + * to this 4K subpage. + * + * If the @hva is already tracked by the manager, set the corresponding + * bit in the @bitmap.
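+ *
+ * As a worked example (illustrative numbers only): for a 2M THP tracked
+ * at track_hva 0x7f0000200000 (order 9), an hva of 0x7f0000265000 lies at
+ * subpage offset 0x65000 within the compound page, so the bit set in
+ * @bitmap is
+ *
+ *	pg_off = (hva & ~track_hva) >> PAGE_SHIFT
+ *	       = (0x7f0000265000 & ~0x7f0000200000) >> 12 = 0x65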
+ */ + sp = shared_page_entry_set_bit_locked(&csv->sp_mgr, hva, NULL); + if (unlikely(sp)) { + pr_err_ratelimited("%s: not pinned but in the tree\n", __func__); + pg_off = (hva & ~sp->track_hva) >> PAGE_SHIFT; + *pfn = page_to_pfn(sp->track_page) + pg_off; + } else { + struct shared_page *old_sp; + struct page *page; + int npinned; mmap_write_lock(current->mm); npinned = pin_user_pages(hva, 1, FOLL_WRITE | FOLL_LONGTERM, &page, NULL); + mmap_write_unlock(current->mm); if (npinned != 1) { - mmap_write_unlock(current->mm); - kmem_cache_free(csv->sp_slab, sp); pr_err_ratelimited("Failure pin gfn:0x%llx\n", gfn); return -ENOMEM; } - mmap_write_unlock(current->mm); - sp->page = page; - sp->gfn = gfn; - shared_page_insert(&csv->sp_mgr, sp); - update_csv_share_mem(page, true); - } + sp = alloc_shared_page_entry(csv, hva, page); + if (!sp) { + unpin_user_page(page); + return -ENOMEM; + } + + if (insert_shared_page_entry_locked(&csv->sp_mgr, sp, &old_sp)) { + *pfn = page_to_pfn(page); + } else { + pr_err_ratelimited("%s: search fail but insertion found\n", __func__); + unpin_user_page(page); + free_shared_page_entry(csv, sp); - *pfn = page_to_pfn(sp->page); + shared_page_entry_set_bit_locked(&csv->sp_mgr, hva, old_sp); + + pg_off = (hva & ~old_sp->track_hva) >> PAGE_SHIFT; + *pfn = page_to_pfn(old_sp->track_page) + pg_off; + } + } return 0; } @@ -1657,7 +1954,7 @@ static int csv_page_fault(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, level = CSV_PG_LEVEL_4K; else { mutex_lock(&csv->sp_lock); - ret = csv_pin_shared_memory(vcpu, slot, gfn, &pfn); + ret = csv_pin_shared_memory_locked(vcpu, slot, gfn, &pfn); mutex_unlock(&csv->sp_lock); if (ret) { /* Resume guest to retry #NPF. */ @@ -1681,33 +1978,232 @@ static int csv_page_fault(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } /** - * Return negative error code on fail, - * or return the number of pages unpinned successfully + * csv_release_shared_memory - Release shared pages and notify userspace to + * madvise. + * @params: Pointer to ioctl input/output structure, containing GPA range + * and output fields. + * + * This function processes the KVM_CSV3_RELEASE_SHARED_MEMORY command by: + * - Iterating over each page in the specified GPA range; + * - For each page, attempting to remove it from the shared page manager; + * - If successful, updating the total number of unpinned pages; + * - Setting the start_hva to the first HVA that needs madvise (for + * contiguous range). + * + * The kernel ensures that: + * - Only one thread can modify the shared page manager at a time (via + * sp_lock); + * - The start_hva is set to the lowest HVA among all removed pages; + * - Userspace VMM should call madvise() on [start_hva, start_hva + + * unpinned * PAGE_SIZE). + * + * Return: 0 on success, negative error code on failure (though none are + * currently returned). 
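+ *
+ * A rough userspace sketch (illustrative only; the command is issued through
+ * the pre-existing CSV3 handle-memory ioctl path, which this patch does not
+ * change, issue_csv3_handle_memory() stands in for the VMM's own wrapper, and
+ * MADV_DONTNEED is just one possible advice):
+ *
+ *	struct kvm_csv_handle_memory params = {
+ *		.gpa = gpa,
+ *		.num_pages = n,
+ *		.opcode = KVM_CSV3_RELEASE_SHARED_MEMORY,
+ *	};
+ *
+ *	issue_csv3_handle_memory(vm_fd, &params);
+ *	if (params.unpinned)
+ *		madvise((void *)params.start_hva,
+ *			(size_t)params.unpinned * PAGE_SIZE, MADV_DONTNEED);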
*/ -static int csv_unpin_shared_memory(struct kvm *kvm, gpa_t gpa, u32 num_pages) +static int csv_release_shared_memory(struct kvm *kvm, + struct kvm_csv_handle_memory *params) { - struct kvm_csv_info *csv; + struct kvm_csv_info *csv = &to_kvm_svm_csv(kvm)->csv_info; struct shared_page *sp; - gfn_t gfn; - unsigned long i; - int unpin_cnt = 0; + u64 hva; + gfn_t gfn = gpa_to_gfn(params->gpa); + u32 num_pages = params->num_pages; + int i; - csv = &to_kvm_svm_csv(kvm)->csv_info; - gfn = gpa_to_gfn(gpa); + /* Initialize output fields */ + params->start_hva = 0; + params->unpinned = 0; + params->handled0 = 0; + + /* Protect shared page manager from concurrent access */ mutex_lock(&csv->sp_lock); for (i = 0; i < num_pages; i++, gfn++) { - sp = shared_page_remove(&csv->sp_mgr, gfn); + hva = gfn_to_hva(kvm, gfn); + + if (unlikely(!params->start_hva)) + params->start_hva = hva; + + /* + * The madvise issued from user space needs an hva and a + * length, so this handler must report back one contiguous + * range of hva. We maintain start_hva as the smallest hva + * among all released pages. + */ + if ((params->start_hva >> PAGE_SHIFT) + + params->unpinned != (hva >> PAGE_SHIFT)) + break; + + /* + * Try to remove the shared_page entry corresponding to this + * hva. If remove_shared_page_entry_locked() finds and removes + * the entry from the manager tree, it returns the entry, and + * we can release the page tracked by the entry. + */ + sp = remove_shared_page_entry_locked(&csv->sp_mgr, hva); if (sp) { - update_csv_share_mem(sp->page, false); - unpin_user_page(sp->page); - kmem_cache_free(csv->sp_slab, sp); - csv->sp_mgr.count--; - unpin_cnt++; + /* Update @unpinned only when an sp is removed */ + params->unpinned += 1U << sp->order; + + /* + * If this page is a tail page of a compound page, its + * HVA may be smaller than the head page's HVA. Since + * madvise must cover the entire compound page, we + * update start_hva to the smallest HVA in the range. + */ + if (sp->track_hva < params->start_hva) { + params->handled0 = (1U << sp->order) - + ((params->start_hva - sp->track_hva) >> PAGE_SHIFT); + params->start_hva = sp->track_hva; + } else { + params->handled0 += 1U << sp->order; + } + + if (page_maybe_dma_pinned(sp->track_page)) + unpin_user_page(sp->track_page); + else + pr_err_ratelimited("%s: the track_page was not pinned\n", + __func__); + free_shared_page_entry(csv, sp); + } else { + break; } + + cond_resched(); } mutex_unlock(&csv->sp_lock); - return unpin_cnt; + return 0; +} + +static inline unsigned long get_vma_flags(unsigned long addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long vm_flags = 0; + + down_read(&mm->mmap_lock); + vma = find_vma(mm, addr); + if (vma && vma->vm_start <= addr) + vm_flags = vma->vm_flags; + up_read(&mm->mmap_lock); + + return vm_flags; +} + +#define RETRY_CSV3_PIN_MAX 5 +#define RETRY_CSV3_ALLOC_SP_MAX 5 + +/** + * csv_get_shared_memory - Pin the shared pages in the GPA range given in the + * input. + * @params: Pointer to ioctl input/output structure, containing GPA range and + * output fields. + * + * This function processes the KVM_CSV3_GET_SHARED_MEMORY command by: + * - Iterating over each page in the specified GPA range; + * - For each page, checking if it is already tracked in the shared page + * manager; + * - If not, attempting to pin it and create a new entry; + * - Incrementing @pinned for every page that is successfully pinned; + * - Incrementing @handled1 for every GPA that is handled.
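+ *
+ * A rough sketch of how a VMM might consume the output to make forward
+ * progress (illustrative only; issue_csv3_handle_memory() again stands in
+ * for the VMM's own wrapper around the pre-existing ioctl path):
+ *
+ *	__u64 gpa = start_gpa;
+ *	__u32 left = total_pages;
+ *
+ *	while (left) {
+ *		struct kvm_csv_handle_memory params = {
+ *			.gpa = gpa, .num_pages = left,
+ *			.opcode = KVM_CSV3_GET_SHARED_MEMORY,
+ *		};
+ *
+ *		issue_csv3_handle_memory(vm_fd, &params);
+ *		if (!params.handled1)
+ *			break;
+ *		gpa += (__u64)params.handled1 * PAGE_SIZE;
+ *		left -= params.handled1;
+ *	}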
+ * + * Return: 0 on success, negative error code on failure. + */ +static int csv_get_shared_memory(struct kvm *kvm, + struct kvm_csv_handle_memory *params) +{ + struct kvm_csv_info *csv = &to_kvm_svm_csv(kvm)->csv_info; + struct shared_page *sp, *old_sp; + struct page *page; + unsigned long vm_flags = 0; + u64 hva; + gfn_t gfn = gpa_to_gfn(params->gpa); + u32 num_pages = params->num_pages; + int i, npinned, try_pin, try_alloc_sp; + int ret = 0; + + /* Initialize output field */ + params->pinned = 0; + params->handled1 = 0; + + /* Protect shared page manager from concurrent access */ + mutex_lock(&csv->sp_lock); + for (i = 0; i < num_pages; i++, gfn++) { + hva = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(hva)) { + /* + * If the HVA is invalid (e.g., not mapped), skip this + * page. This prevents unnecessary pinning attempts and + * continues processing remaining pages. + */ + params->handled1++; + continue; + } + + if (unlikely(!vm_flags)) + vm_flags = get_vma_flags((unsigned long)hva); + + /* + * Reject hugetlbfs-backed pages to prevent severe page + * migration pressure when 2M hugetlb is used with CMA. + */ + if (!vm_flags || (vm_flags & VM_HUGETLB)) { + ret = -EINVAL; + goto out; + } + + sp = shared_page_entry_set_bit_locked(&csv->sp_mgr, hva, NULL); + if (sp) { + params->handled1++; + continue; + } + + try_pin = 0; + try_alloc_sp = 0; +retry_pin: + /* + * The page is not yet tracked. We need to pin it and create a + * new entry. Acquire mm_write_lock to safely pin the page. + */ + mmap_write_lock(current->mm); + npinned = pin_user_pages(hva, 1, FOLL_WRITE | FOLL_LONGTERM, &page, NULL); + mmap_write_unlock(current->mm); + if (npinned != 1) { + if (++try_pin <= RETRY_CSV3_PIN_MAX) + goto retry_pin; + pr_err_ratelimited("%s: try pin fail\n", __func__); + break; + } + +retry_alloc_sp: + sp = alloc_shared_page_entry(csv, hva, page); + if (!sp) { + if (++try_alloc_sp <= RETRY_CSV3_ALLOC_SP_MAX) + goto retry_alloc_sp; + pr_err_ratelimited("%s: try alloc sp fail\n", __func__); + unpin_user_page(page); + break; + } + + if (!insert_shared_page_entry_locked(&csv->sp_mgr, sp, &old_sp)) { + pr_err_ratelimited("%s: search fail but insertion found\n", + __func__); + unpin_user_page(page); + free_shared_page_entry(csv, sp); + + shared_page_entry_set_bit_locked(&csv->sp_mgr, hva, old_sp); + } else { + /* Update @pinned only upon insertion of a new sp */ + params->pinned += 1U << sp->order; + } + params->handled1++; + + cond_resched(); + } + +out: + mutex_unlock(&csv->sp_lock); + + return ret; } static void csv_vm_destroy(struct kvm *kvm) @@ -1725,14 +2221,28 @@ static void csv_vm_destroy(struct kvm *kvm) mutex_lock(&csv->sp_lock); while ((node = rb_first(&csv->sp_mgr.root))) { sp = rb_entry(node, struct shared_page, node); - update_csv_share_mem(sp->page, false); + /* Remove shared page entry from the manager tree */ rb_erase(&sp->node, &csv->sp_mgr.root); - unpin_user_page(sp->page); - kmem_cache_free(csv->sp_slab, sp); + /* Update shared page statistics */ csv->sp_mgr.count--; + csv->sp_mgr.nr_pages -= 1UL << sp->order; + update_csv_share_mem(sp->track_page, false); + /* Putback the tracked page to system */ + if (page_maybe_dma_pinned(sp->track_page)) + unpin_user_page(sp->track_page); + else + pr_err_ratelimited("%s: the track_page was not pinned\n", + __func__); + free_shared_page_entry(csv, sp); + + cond_resched(); } mutex_unlock(&csv->sp_lock); + if (csv->sp_mgr.count || csv->sp_mgr.nr_pages) + pr_err("%s: SP_MGR_ERR: track fault, cnt:%lld nr_pages:0x%lx\n", + __func__, csv->sp_mgr.count, 
csv->sp_mgr.nr_pages); + kmem_cache_destroy(csv->sp_slab); csv->sp_slab = NULL; @@ -1817,10 +2327,12 @@ static void csv_guest_memory_reclaimed(struct kvm *kvm) static int csv_handle_memory(struct kvm *kvm, struct kvm_sev_cmd *argp) { + struct kvm_csv_info *csv = &to_kvm_svm_csv(kvm)->csv_info; struct kvm_csv_handle_memory params; int r = -EINVAL; - if (!csv3_guest(kvm)) + if (!csv3_guest(kvm) || + !(csv->inuse_ext & KVM_CAP_HYGON_COCO_EXT_CSV3_SP_MGR)) return -ENOTTY; if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, @@ -1829,12 +2341,20 @@ static int csv_handle_memory(struct kvm *kvm, struct kvm_sev_cmd *argp) switch (params.opcode) { case KVM_CSV3_RELEASE_SHARED_MEMORY: - r = csv_unpin_shared_memory(kvm, params.gpa, params.num_pages); + r = csv_release_shared_memory(kvm, ¶ms); break; - default: + case KVM_CSV3_GET_SHARED_MEMORY: + r = csv_get_shared_memory(kvm, ¶ms); break; + default: + goto out; } + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, + sizeof(params))) + return -EFAULT; + +out: return r; }; @@ -2040,6 +2560,7 @@ static int csv_get_hygon_coco_extension(struct kvm *kvm) csv->kvm_ext |= KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET; if (csv->fw_ext & CSV_EXT_CSV3_LFINISH_EX) csv->kvm_ext |= KVM_CAP_HYGON_COCO_EXT_CSV3_LFINISH_EX; + csv->kvm_ext |= KVM_CAP_HYGON_COCO_EXT_CSV3_SP_MGR; } csv->kvm_ext_valid = true; } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e06142d1208ae9bf875ad4c380e9102b3ed3355c..a697fe11a561b743ce194e2b7dbb772e0a04c638 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1675,6 +1675,106 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) ); +/* + * Tracepoint for the Hygon CSV3 shared page processing + */ +TRACE_EVENT(kvm_csv3_sp_insert_dup, + TP_PROTO(unsigned long n_track_pfn, + u64 n_track_hva, + unsigned int n_order, + unsigned long o_track_pfn, + u64 o_track_hva, + unsigned int o_order), + TP_ARGS(n_track_pfn, n_track_hva, n_order, + o_track_pfn, o_track_hva, o_order), + + TP_STRUCT__entry( + __field(unsigned long, n_track_pfn) + __field(u64, n_track_hva) + __field(unsigned int, n_order) + __field(unsigned long, o_track_pfn) + __field(u64, o_track_hva) + __field(unsigned int, o_order) + ), + + TP_fast_assign( + __entry->n_track_pfn = n_track_pfn; + __entry->n_track_hva = n_track_hva; + __entry->n_order = n_order; + __entry->o_track_pfn = o_track_pfn; + __entry->o_track_hva = o_track_hva; + __entry->o_order = o_order; + ), + + TP_printk("n_sp pfn:0x%lx, hva:0x%llx, order:%u " + "o_sp pfn:0x%lx, hva:0x%llx, order:%u", + __entry->n_track_pfn, __entry->n_track_hva, __entry->n_order, + __entry->o_track_pfn, __entry->o_track_hva, __entry->o_order) +); + +TRACE_EVENT(kvm_csv3_sp_insert, + TP_PROTO(unsigned long n_track_pfn, u64 n_track_hva, unsigned int n_order), + TP_ARGS(n_track_pfn, n_track_hva, n_order), + + TP_STRUCT__entry( + __field(unsigned long, n_track_pfn) + __field(u64, n_track_hva) + __field(unsigned int, n_order) + ), + + TP_fast_assign( + __entry->n_track_pfn = n_track_pfn; + __entry->n_track_hva = n_track_hva; + __entry->n_order = n_order; + ), + + TP_printk("n_sp pfn:0x%lx, hva:0x%llx order:%u", + __entry->n_track_pfn, __entry->n_track_hva, __entry->n_order) +); + +TRACE_EVENT(kvm_csv3_sp_hit, + TP_PROTO(unsigned long track_pfn, u64 track_hva, u64 hva, unsigned int order), + TP_ARGS(track_pfn, track_hva, hva, order), + + TP_STRUCT__entry( + __field(unsigned long, track_pfn) + __field(u64, track_hva) + __field(u64, hva) + __field(unsigned int, 
order) + ), + + TP_fast_assign( + __entry->track_pfn = track_pfn; + __entry->track_hva = track_hva; + __entry->hva = hva; + __entry->order = order; + ), + + TP_printk("sp pfn:0x%lx, hva:0x%llx, subhva:0x%llx, order:%u", + __entry->track_pfn, __entry->track_hva, + __entry->hva, __entry->order) +); + +TRACE_EVENT(kvm_csv3_sp_remove, + TP_PROTO(unsigned long track_pfn, u64 track_hva, unsigned int order), + TP_ARGS(track_pfn, track_hva, order), + + TP_STRUCT__entry( + __field(unsigned long, track_pfn) + __field(u64, track_hva) + __field(unsigned int, order) + ), + + TP_fast_assign( + __entry->track_pfn = track_pfn; + __entry->track_hva = track_hva; + __entry->order = order; + ), + + TP_printk("sp pfn:0x%lx, hva:0x%llx order:%u", + __entry->track_pfn, __entry->track_hva, __entry->order) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 383c051813ffc26b250d592312b9e4f5c51257c0..30d8815d1942fc90057cde4348712c875daca53a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12187,6 +12187,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_csv3_sp_insert_dup); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_csv3_sp_insert); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_csv3_sp_hit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_csv3_sp_remove); static int __init kvm_x86_init(void) { diff --git a/arch/x86/mm/csv.c b/arch/x86/mm/csv.c index 08653c91a2af56bdf2a91c9149f8a8c6ed54e5c4..90ab6fbc4409f3f7086c60405ac8c454b34b118c 100644 --- a/arch/x86/mm/csv.c +++ b/arch/x86/mm/csv.c @@ -90,11 +90,14 @@ unsigned int csv_smr_num; EXPORT_SYMBOL_GPL(csv_smr_num); #ifdef CONFIG_SYSFS -/* - * Global counters exposed via sysfs /sys. Updated atomically during VM creation/destruction. +/** + * Global counters exposed via /sys/kernel/mm/csv3_cma/mem_info. Updated + * atomically during VM creation/destruction. + * * csv3_npt_size: total size of NPT tables allocated. * csv3_pri_mem: total private memory allocated for CSV guests. * csv3_meta: metadata overhead for CSV memory regions. + * csv3_shared_mem: size of all the CSV3 VMs' shared memory. 
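+ *
+ * csv3_shared_mem is kept per NUMA node; a reader (e.g. the mem_info show
+ * handler) can total it with something like:
+ *
+ *	for (nid = 0; nid < MAX_NUMNODES; nid++)
+ *		total += atomic_long_read(&csv3_shared_mem[nid]);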
 */ atomic_long_t csv3_npt_size = ATOMIC_LONG_INIT(0); EXPORT_SYMBOL_GPL(csv3_npt_size); @@ -105,7 +108,7 @@ EXPORT_SYMBOL_GPL(csv3_pri_mem); unsigned long csv3_meta; EXPORT_SYMBOL_GPL(csv3_meta); -atomic_long_t *csv3_shared_mem; +atomic_long_t csv3_shared_mem[MAX_NUMNODES]; EXPORT_SYMBOL_GPL(csv3_shared_mem); #endif @@ -480,8 +483,6 @@ static struct kobject *csv_cma_kobj_root; static int __init csv_cma_sysfs_init(void) { - int node_count; - size_t mem_size; int err, i; if (!is_x86_vendor_hygon() || !boot_cpu_has(X86_FEATURE_CSV3)) @@ -495,20 +496,7 @@ static int __init csv_cma_sysfs_init(void) if (err) goto out; - node_count = num_online_nodes(); - if (node_count <= 0) { - pr_err("No online NUMA nodes detected\n"); - goto out; - } - - mem_size = node_count * sizeof(atomic_long_t); - csv3_shared_mem = kzalloc(mem_size, GFP_KERNEL); - if (!csv3_shared_mem) { - pr_err("Failed to allocate shared memory\n"); - goto out; - } - - for (i = 0; i < node_count; i++) + for (i = 0; i < MAX_NUMNODES; i++) atomic_long_set(&csv3_shared_mem[i], 0); return 0; @@ -526,9 +514,6 @@ static void csv_cma_sysfs_exit(void) */ if (csv_cma_kobj_root != NULL) kobject_put(csv_cma_kobj_root); - - kfree(csv3_shared_mem); - csv3_shared_mem = NULL; } #else /* !CONFIG_SYSFS */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 853667e4e123bcd9444c3fc42216a606458eb123..73ccbd6eb8669ab640de3be50b0d70ee333ec716 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1108,6 +1108,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET (1 << 2) /* support finish launch process by CSV3_CMD_LAUNCH_FINISH_EX firmware API */ #define KVM_CAP_HYGON_COCO_EXT_CSV3_LFINISH_EX (1 << 3) +/* support for userspace-requested management of CSV3 shared pages */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_SP_MGR (1 << 4) #define KVM_CAP_ARM_CPU_FEATURE 555 @@ -1999,11 +2001,71 @@ struct kvm_csv_receive_encrypt_context { __u32 trans_len; }; -#define KVM_CSV3_RELEASE_SHARED_MEMORY (0x0001) - +/** + * struct kvm_csv_handle_memory - IOCTL data structure for CSV3 memory + * operations. + * + * This union is used to pass input and output parameters between userspace + * and the kernel for KVM_CSV3 memory management commands. + * + * It supports two ioctl commands: + * - KVM_CSV3_RELEASE_SHARED_MEMORY: Notify the userspace VMM to madvise pages + * as unused, releasing shared memory mappings. + * - KVM_CSV3_GET_SHARED_MEMORY: Pin the shared pages in a given GPA range + * and report how many pages were pinned. + * + * The union contains three nested structs: + * - Input (for both commands) + * - Output (for KVM_CSV3_RELEASE_SHARED_MEMORY) + * - Output (for KVM_CSV3_GET_SHARED_MEMORY) + */ struct kvm_csv_handle_memory { - __u64 gpa; - __u32 num_pages; - __u32 opcode; + union { + /* Input of the ioctl command. */ + /** + * @gpa: The start guest physical address (GPA) to be handled. + * @num_pages: The number of consecutive pages starting at + * @gpa. + * @opcode: The command ID that determines which operation to + * perform. + */ + struct { + __u64 gpa; + __u32 num_pages; +#define KVM_CSV3_RELEASE_SHARED_MEMORY 0x0001 +#define KVM_CSV3_GET_SHARED_MEMORY 0x0002 + __u32 opcode; + }; + /* Output of ioctl command KVM_CSV3_RELEASE_SHARED_MEMORY. */ + /** + * @start_hva: The start host virtual address (HVA) that the + * userspace VMM should madvise as unused. + * @unpinned: The number of pages that were successfully + * unpinned and can now be madvised in userspace. + * @handled0: The number of pages, starting from @gpa, that have + * been handled. The userspace VMM needs this + * information to make forward progress. + */ + struct { + __u64 start_hva; + __u32 unpinned; + __u32 handled0; + }; + /* Output of ioctl command KVM_CSV3_GET_SHARED_MEMORY. */ + /** + * @unused1: Padding for alignment. + * @pinned: The number of pages newly pinned for the + * specified GPA range. + * @handled1: The number of pages, starting from @gpa, that have + * been handled. The userspace VMM needs this + * information to make forward progress. + */ + struct { + __u64 unused1; + __u32 pinned; + __u32 handled1; + }; + }; }; #endif /* __LINUX_KVM_H */