diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index b21a34c34a211d06fe4cbf9fa78281c0c3f821f9..88fa495abbaca445a15258fb3e2c4a8d2da10dfb 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -16,7 +16,19 @@ The acquisition orders for mutexes are as follows: - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring them together is quite rare. -On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. +- Unlike kvm->slots_lock, kvm->slots_arch_lock is released before + synchronize_srcu(&kvm->srcu). Therefore kvm->slots_arch_lock + can be taken inside a kvm->srcu read-side critical section, + while kvm->slots_lock cannot. + +On x86: + +- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock + +- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and + kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and + cannot be taken without already holding kvm->arch.mmu_lock (typically with + ``read_lock`` for the TDP MMU, thus the need for additional spinlocks). Everything else is a leaf: no other lock is taken inside the critical sections. @@ -31,25 +43,24 @@ the mmu-lock on x86. Currently, the page fault can be fast in one of the following two cases: 1. Access Tracking: The SPTE is not present, but it is marked for access - tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to - restore the saved R/X bits. This is described in more detail later below. + tracking. That means we need to restore the saved R/X bits. This is + described in more detail later below. -2. Write-Protection: The SPTE is present and the fault is - caused by write-protect. That means we just need to change the W bit of - the spte. +2. Write-Protection: The SPTE is present and the fault is caused by + write-protect. That means we just need to change the W bit of the spte. -What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and -SPTE_MMU_WRITEABLE bit on the spte: +What we use to avoid all the race is the Host-writable bit and MMU-writable bit +on the spte: -- SPTE_HOST_WRITEABLE means the gfn is writable on host. -- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when - the gfn is writable on guest mmu and it is not write-protected by shadow - page write-protection. +- Host-writable means the gfn is writable in the host kernel page tables and in + its KVM memslot. +- MMU-writable means the gfn is writable in the guest's mmu and it is not + write-protected by shadow page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or -restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This -is safe because whenever changing these bits can be detected by cmpxchg. +bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, to restore the saved +R/X bits if for an access-traced spte, or both. This is safe because whenever +changing these bits can be detected by cmpxchg. But we need carefully check these cases: @@ -178,17 +189,17 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update(). Lockless Access Tracking: This is used for Intel CPUs that are using EPT but do not support the EPT A/D -bits. In this case, when the KVM MMU notifier is called to track accesses to a -page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present -by clearing the RWX bits in the PTE and storing the original R & X bits in -some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the -PTE (using the ignored bit 62). When the VM tries to access the page later on, -a fault is generated and the fast page fault mechanism described above is used -to atomically restore the PTE to a Present state. The W bit is not saved when -the PTE is marked for access tracking and during restoration to the Present -state, the W bit is set depending on whether or not it was a write access. If -it wasn't, then the W bit will remain clear until a write access happens, at -which time it will be set using the Dirty tracking mechanism described above. +bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and +when the KVM MMU notifier is called to track accesses to a page (via +kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware +by clearing the RWX bits in the PTE and storing the original R & X bits in more +unused/ignored bits. When the VM tries to access the page later on, a fault is +generated and the fast page fault mechanism described above is used to +atomically restore the PTE to a Present state. The W bit is not saved when the +PTE is marked for access tracking and during restoration to the Present state, +the W bit is set depending on whether or not it was a write access. If it +wasn't, then the W bit will remain clear until a write access happens, at which +time it will be set using the Dirty tracking mechanism described above. 3. Reference ------------ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 9cbd22aa0e1c901c6df7bb67d61290ae0996f0a9..ae068c02a0c435fadeb8ed0139ff787a2af5e5ec 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -29,7 +29,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED -#define KVM_USER_MEM_SLOTS 512 #define KVM_HALT_POLL_NS_DEFAULT 500000 #include @@ -526,11 +525,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index ab3ab398fb963bbed03a381cca1221aa93dec7ac..cf827a6c330175f40b8bc5b869a631ccf9ab5791 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -5,12 +5,12 @@ #ifndef __ASM_SPINLOCK_H #define __ASM_SPINLOCK_H -#include #include #include /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 << 15) +#include /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 23a51f0bab7590bb0b8fd989583d60dd240ea150..6fa92a1435a5f5580f3ab5f3894bbb93fbdc26d9 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -872,7 +872,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk * the page we just got a reference to gets unmapped before we have a * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_hva will take it away + * unmapped afterwards, the call to kvm_unmap_gfn will take it away * from us again properly. This smp_rmb() interacts with the smp_wmb() * in kvm_mmu_notifier_invalidate_. */ @@ -1106,126 +1106,70 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) return ret; } -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, - gpa_t gpa, u64 size, - void *data), - void *data) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gpa; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; - ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); - } - - return ret; -} - -static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - unsigned flags = *(unsigned *)data; - bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE; - - __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block); - return 0; -} - -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_unmap_hva_range(start, end); - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - kvm_pfn_t *pfn = (kvm_pfn_t *)data; - - WARN_ON(size != PAGE_SIZE); + __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); - /* - * The MMU notifiers will have unmapped a huge PMD before calling - * ->change_pte() (which in turn calls kvm_set_spte_hva()) and - * therefore we never need to clear out a huge PMD through this - * calling path and a memcache is not required. - */ - kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE, - __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL); return 0; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - unsigned long end = hva + PAGE_SIZE; - kvm_pfn_t pfn = pte_pfn(pte); + kvm_pfn_t pfn = pte_pfn(range->pte); if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_set_spte_hva(hva); + WARN_ON(range->end - range->start != 1); /* * We've moved a page around, probably through CoW, so let's treat it * just like a translation fault and clean the cache to the PoC. */ clean_dcache_guest_page(pfn, PAGE_SIZE); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn); + + /* + * The MMU notifiers will have unmapped a huge PMD before calling + * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and + * therefore we never need to clear out a huge PMD through this + * calling path and a memcache is not required. + */ + kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, + PAGE_SIZE, __pfn_to_phys(pfn), + KVM_PGTABLE_PROT_R, NULL); + return 0; } -static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - pte_t pte; + u64 size = (range->end - range->start) << PAGE_SHIFT; kvm_pte_t kpte; + pte_t pte; + + if (!kvm->arch.mmu.pgt) + return 0; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa); + + kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT); pte = __pte(kpte); return pte_valid(pte) && pte_young(pte); } -static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa); -} - -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_age_hva(start, end); - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); -} -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - if (!kvm->arch.mmu.pgt) - return 0; - trace_kvm_test_age_hva(hva); - return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, - kvm_test_age_hva_handler, NULL); + return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT); } phys_addr_t kvm_mmu_get_httbr(void) diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index d5edb4cb217b05c6fc51018fa47d5efebd450e8f..07960c0ad3e913a3c6df1d2602b11c82126c386c 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -136,72 +136,6 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); -TRACE_EVENT(kvm_unmap_hva_range, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_set_spte_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) -); - -TRACE_EVENT(kvm_age_hva, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier age hva: %#016lx -- %#016lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_test_age_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) -); - TRACE_EVENT(kvm_set_way_flush, TP_PROTO(unsigned long vcpu_pc, bool cache), TP_ARGS(vcpu_pc, cache), diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 8f181f8cfa8cf467b9f337d8a61becdffc94053f..a71b5813d64f7e399dc95946b3bf248df52236a9 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -29,7 +29,6 @@ #define LOONGSON_VIRT_REG_BASE 0x1f000000 #define KVM_MAX_VCPUS 256 -#define KVM_USER_MEM_SLOTS 256 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 @@ -299,11 +298,6 @@ enum _kvm_fault_result { }; #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, bool blockable); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); static inline void update_pc(struct kvm_vcpu_arch *arch) { diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 4e4c1ddabb5d3ccbdffe526a62850c793be0ae03..cdd16862fb4c98ae3d92a67367d221ced66c7af8 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -712,7 +712,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, return npages > 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable) +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { unsigned long npages; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 24f3d0f9996b1871ea60fdfe068520c5db8809fc..b70645670548f37b1f45a16fce75714d49e619f8 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -83,7 +83,6 @@ #define KVM_MAX_VCPUS 16 -#define KVM_USER_MEM_SLOTS 16 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 @@ -816,14 +815,7 @@ struct kvm_mips_callbacks { int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); - void (*flush_shadow_all)(struct kvm *kvm); - /* - * Must take care of flushing any cached GPA PTEs (e.g. guest entries in - * VZ root TLB, or T&E GVA page tables and corresponding root TLB - * mappings). - */ - void (*flush_shadow_memslot)(struct kvm *kvm, - const struct kvm_memory_slot *slot); + void (*prepare_flush_shadow)(struct kvm *kvm); gpa_t (*gva_to_gpa)(gva_t gva); void (*queue_timer_int)(struct kvm_vcpu *vcpu); void (*dequeue_timer_int)(struct kvm_vcpu *vcpu); @@ -968,11 +960,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, bool write); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); @@ -1155,4 +1143,7 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +int kvm_arch_flush_remote_tlb(struct kvm *kvm); + #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 8a88eb2655160da1a5055af4249825046026ab4b..6ce2117e49f6f91b8e47f6b678f585796c80c9a9 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -10,7 +10,6 @@ #define _ASM_SPINLOCK_H #include -#include #include @@ -27,5 +26,6 @@ static inline void queued_spin_unlock(struct qspinlock *lock) } #include +#include #endif /* _ASM_SPINLOCK_H */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 3d6a7f5827b17ce6e96103304432c42d89faa52c..730b9ef0fb9aa0ccef5c250aaeeecd58ffc409b3 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -204,9 +204,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) { /* Flush whole GPA */ kvm_mips_flush_gpa_pt(kvm, 0, ~0); - - /* Let implementation do the rest */ - kvm_mips_callbacks->flush_shadow_all(kvm); + kvm_flush_remote_tlbs(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -221,8 +219,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, /* Flush slot from GPA */ kvm_mips_flush_gpa_pt(kvm, slot->base_gfn, slot->base_gfn + slot->npages - 1); - /* Let implementation do the rest */ - kvm_mips_callbacks->flush_shadow_memslot(kvm, slot); + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); spin_unlock(&kvm->mmu_lock); } @@ -262,9 +259,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, /* Write protect GPA page table entries */ needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn, new->base_gfn + new->npages - 1); - /* Let implementation do the rest */ if (needs_flush) - kvm_mips_callbacks->flush_shadow_memslot(kvm, new); + kvm_arch_flush_remote_tlbs_memslot(kvm, new); spin_unlock(&kvm->mmu_lock); } } @@ -996,11 +992,16 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) } +int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + kvm_mips_callbacks->prepare_flush_shadow(kvm); + return 1; +} + void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { - /* Let implementation handle TLB/GVA invalidation */ - kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); + kvm_flush_remote_tlbs(kvm); } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index c0c5f39c34341de3aabb2c28187ab4c156a6bca9..a5ecefb9729b4a080d80861ef28359f98fe20c8f 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -438,85 +438,34 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn, end_gfn << PAGE_SHIFT); } -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, gfn_t gfn, - gpa_t gfn_end, - struct kvm_memory_slot *memslot, - void *data), - void *data) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - ret |= handler(kvm, gfn, gfn_end, memslot, data); - } - - return ret; -} - - -static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) -{ - kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end); + kvm_mips_flush_gpa_pt(kvm, range->start, range->end); return 1; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) -{ - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); - - kvm_mips_callbacks->flush_shadow_all(kvm); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - gpa_t gpa = gfn << PAGE_SHIFT; - pte_t hva_pte = *(pte_t *)data; + gpa_t gpa = range->start << PAGE_SHIFT; + pte_t hva_pte = range->pte; pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); pte_t old_pte; if (!gpa_pte) - return 0; + return false; /* Mapping may need adjusting depending on memslot flags */ old_pte = *gpa_pte; - if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) + if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) hva_pte = pte_mkclean(hva_pte); - else if (memslot->flags & KVM_MEM_READONLY) + else if (range->slot->flags & KVM_MEM_READONLY) hva_pte = pte_wrprotect(hva_pte); set_pte(gpa_pte, hva_pte); /* Replacing an absent or old page doesn't need flushes */ if (!pte_present(old_pte) || !pte_young(old_pte)) - return 0; + return false; /* Pages swapped, aged, moved, or cleaned require flushes */ return !pte_present(hva_pte) || @@ -525,27 +474,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, (pte_dirty(old_pte) && !pte_dirty(hva_pte)); } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) -{ - unsigned long end = hva + PAGE_SIZE; - int ret; - - ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); - if (ret) - kvm_mips_callbacks->flush_shadow_all(kvm); - return 0; -} - -static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end); + return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end); } -static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - gpa_t gpa = gfn << PAGE_SHIFT; + gpa_t gpa = range->start << PAGE_SHIFT; pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); if (!gpa_pte) @@ -553,16 +489,6 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, return pte_young(*gpa_pte); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -{ - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); -} - -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); -} - /** * _kvm_mips_map_page_fast() - Fast path GPA fault handler. * @vcpu: VCPU pointer. diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 0788c00d7e94b4c54d25fd03e5f8d290de8c2e1a..5f2df497599c2db86adc97580c648ab3eccfd410 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -687,16 +687,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm) +static void kvm_trap_emul_prepare_flush_shadow(struct kvm *kvm) { - /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */ - kvm_flush_remote_tlbs(kvm); -} - -static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, - const struct kvm_memory_slot *slot) -{ - kvm_trap_emul_flush_shadow_all(kvm); } static u64 kvm_trap_emul_get_one_regs[] = { @@ -1280,8 +1272,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, - .flush_shadow_all = kvm_trap_emul_flush_shadow_all, - .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot, + .prepare_flush_shadow = kvm_trap_emul_prepare_flush_shadow, .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb, .queue_timer_int = kvm_mips_queue_timer_int_cb, .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb, diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 2ffbe9264a316e48224bd02793bb5608caebd4f5..2c75571dc4a25cc6865b611972e16f5c10c25b3a 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3211,32 +3211,22 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -static void kvm_vz_flush_shadow_all(struct kvm *kvm) +static void kvm_vz_prepare_flush_shadow(struct kvm *kvm) { - if (cpu_has_guestid) { - /* Flush GuestID for each VCPU individually */ - kvm_flush_remote_tlbs(kvm); - } else { + if (!cpu_has_guestid) { /* * For each CPU there is a single GPA ASID used by all VCPUs in * the VM, so it doesn't make sense for the VCPUs to handle * invalidation of these ASIDs individually. * * Instead mark all CPUs as needing ASID invalidation in - * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to + * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will * kick any running VCPUs so they check asid_flush_mask. */ cpumask_setall(&kvm->arch.asid_flush_mask); - kvm_flush_remote_tlbs(kvm); } } -static void kvm_vz_flush_shadow_memslot(struct kvm *kvm, - const struct kvm_memory_slot *slot) -{ - kvm_vz_flush_shadow_all(kvm); -} - static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -3292,8 +3282,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = { .vcpu_init = kvm_vz_vcpu_init, .vcpu_uninit = kvm_vz_vcpu_uninit, .vcpu_setup = kvm_vz_vcpu_setup, - .flush_shadow_all = kvm_vz_flush_shadow_all, - .flush_shadow_memslot = kvm_vz_flush_shadow_memslot, + .prepare_flush_shadow = kvm_vz_prepare_flush_shadow, .gva_to_gpa = kvm_vz_gva_to_gpa_cb, .queue_timer_int = kvm_vz_queue_timer_int_cb, .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb, diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index d32ec9ae73bd4150b64c90ed2691f1426de73e7d..402963dd17f075509a4a9d296d57b2d1551eb9fb 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -210,12 +210,12 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); -extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); -extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); -extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); +extern void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); extern void kvmppc_radix_flush_memslot(struct kvm *kvm, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d67a470e95a35897de701d75f8cdfb67580bf959..54c25bcbcf97e262a220eb7884098a71fbdfed98 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -28,7 +28,6 @@ #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS -#define KVM_USER_MEM_SLOTS 512 #include @@ -56,13 +55,7 @@ #include #define KVM_ARCH_WANT_MMU_NOTIFIER - -extern int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, - unsigned flags); -extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 0a056c64c317b7f61edacb313ea025ea1c3addef..5950cbe75b287dfc342864fdad8d72548c639172 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -281,11 +281,10 @@ struct kvmppc_ops { const struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); - int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, - unsigned long end); - int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); - int (*test_age_hva)(struct kvm *kvm, unsigned long hva); - void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte); + bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); void (*free_memslot)(struct kvm_memory_slot *slot); int (*init_vm)(struct kvm *kvm); void (*destroy_vm)(struct kvm *kvm); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 44bf567b65899b5a99da163b98616a42b514323d..2b691f4d1f26c56b0c03ff88f1e9da062fd9b30e 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -834,26 +834,24 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change); } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); + return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->age_hva(kvm, start, end); + return kvm->arch.kvm_ops->age_gfn(kvm, range); } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->test_age_hva(kvm, hva); + return kvm->arch.kvm_ops->test_age_gfn(kvm, range); } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); - return 0; + return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); } int kvmppc_core_init_vm(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 9b6323ec8e607f54aa22f87e19c474c68ce815b0..740e51def5a5cc47cf4d1503571968a77909b3de 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -9,12 +9,10 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot); -extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, - unsigned long end); -extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, - unsigned long end); -extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); -extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); +extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 38ea396a23d6e7e9e261effa737688c42e024f19..de295a470f7cb5d9614b253803bc7b8e77ebac76 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, } else { /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, &write_ok); + writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; @@ -751,51 +751,6 @@ void kvmppc_rmap_reset(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, srcu_idx); } -typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); - -static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - hva_handler_fn handler) -{ - int ret; - int retval = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn, gfn+1, ..., gfn_end-1}. - */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for (; gfn < gfn_end; ++gfn) { - ret = handler(kvm, memslot, gfn); - retval |= ret; - } - } - - return retval; -} - -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - hva_handler_fn handler) -{ - return kvm_handle_hva_range(kvm, hva, hva + 1, handler); -} - /* Must be called with both HPTE and rmap locked */ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, struct kvm_memory_slot *memslot, @@ -839,8 +794,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, } } -static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { unsigned long i; __be64 *hptep; @@ -873,16 +828,21 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } - return 0; } -int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + gfn_t gfn; - handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; - kvm_handle_hva_range(kvm, start, end, handler); - return 0; + if (kvm_is_radix(kvm)) { + for (gfn = range->start; gfn < range->end; gfn++) + kvm_unmap_radix(kvm, range->slot, gfn); + } else { + for (gfn = range->start; gfn < range->end; gfn++) + kvm_unmap_rmapp(kvm, range->slot, range->start); + } + + return false; } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, @@ -912,8 +872,8 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, } } -static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; @@ -967,26 +927,34 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, return ret; } -int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + gfn_t gfn; + bool ret = false; - handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp; - return kvm_handle_hva_range(kvm, start, end, handler); + if (kvm_is_radix(kvm)) { + for (gfn = range->start; gfn < range->end; gfn++) + ret |= kvm_age_radix(kvm, range->slot, gfn); + } else { + for (gfn = range->start; gfn < range->end; gfn++) + ret |= kvm_age_rmapp(kvm, range->slot, gfn); + } + + return ret; } -static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long *hp; - int ret = 1; + bool ret = true; unsigned long *rmapp; rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; if (*rmapp & KVMPPC_RMAP_REFERENCED) - return 1; + return true; lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_REFERENCED) @@ -1001,27 +969,33 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, goto out; } while ((i = j) != head); } - ret = 0; + ret = false; out: unlock_rmap(rmapp); return ret; } -int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + WARN_ON(range->start + 1 != range->end); - handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; - return kvm_handle_hva(kvm, hva, handler); + if (kvm_is_radix(kvm)) + return kvm_test_age_radix(kvm, range->slot, range->start); + else + return kvm_test_age_rmapp(kvm, range->slot, range->start); } -void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + WARN_ON(range->start + 1 != range->end); + + if (kvm_is_radix(kvm)) + kvm_unmap_radix(kvm, range->slot, range->start); + else + kvm_unmap_rmapp(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; - kvm_handle_hva(kvm, hva, handler); + return false; } static int vcpus_running(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 04028f905e50e6e34ed3a5bfe19554ddcb1e4b95..e7924664a944574f8f66e7009c97355a387ccfd7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -824,7 +824,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, upgrade_p); + writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; @@ -995,8 +995,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, } /* Called with kvm->mmu_lock held */ -int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; @@ -1004,24 +1004,23 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) { uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT); - return 0; + return; } ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); - return 0; } /* Called with kvm->mmu_lock held */ -int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - int ref = 0; + bool ref = false; unsigned long old, *rmapp; if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) @@ -1037,26 +1036,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0, old & PTE_RPN_MASK, 1UL << shift); - ref = 1; + ref = true; } return ref; } /* Called with kvm->mmu_lock held */ -int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) + { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - int ref = 0; + bool ref = false; if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return ref; ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) - ref = 1; + ref = true; return ref; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1d259323899512f3b944ecdb469230731aa99488..2c993c368cf7da40fa15f30b53db467f513a55ed 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4802,7 +4802,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) kvmhv_release_all_nested(kvm); kvmppc_rmap_reset(kvm); kvm->arch.process_table = 0; - /* Mutual exclusion with kvm_unmap_hva_range etc. */ + /* Mutual exclusion with kvm_unmap_gfn_range etc. */ spin_lock(&kvm->mmu_lock); kvm->arch.radix = 0; spin_unlock(&kvm->mmu_lock); @@ -4824,7 +4824,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) if (err) return err; kvmppc_rmap_reset(kvm); - /* Mutual exclusion with kvm_unmap_hva_range etc. */ + /* Mutual exclusion with kvm_unmap_gfn_range etc. */ spin_lock(&kvm->mmu_lock); kvm->arch.radix = 1; spin_unlock(&kvm->mmu_lock); @@ -5661,10 +5661,10 @@ static struct kvmppc_ops kvm_ops_hv = { .flush_memslot = kvmppc_core_flush_memslot_hv, .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, .commit_memory_region = kvmppc_core_commit_memory_region_hv, - .unmap_hva_range = kvm_unmap_hva_range_hv, - .age_hva = kvm_age_hva_hv, - .test_age_hva = kvm_test_age_hva_hv, - .set_spte_hva = kvm_set_spte_hva_hv, + .unmap_gfn_range = kvm_unmap_gfn_range_hv, + .age_gfn = kvm_age_gfn_hv, + .test_age_gfn = kvm_test_age_gfn_hv, + .set_spte_gfn = kvm_set_spte_gfn_hv, .free_memslot = kvmppc_core_free_memslot_hv, .init_vm = kvmppc_core_init_vm_hv, .destroy_vm = kvmppc_core_destroy_vm_hv, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b1fefa63e125b252fe7810d421d2a099faf057ee..893873448794b7e4bbd639c99398882207647e21 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -425,61 +425,39 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) } /************* MMU Notifiers *************/ -static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { long i; struct kvm_vcpu *vcpu; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT, + range->end << PAGE_SHIFT); - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn, gfn+1, ..., gfn_end-1}. - */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - kvm_for_each_vcpu(i, vcpu, kvm) - kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, - gfn_end << PAGE_SHIFT); - } + return false; } -static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range) { - do_kvm_unmap_hva(kvm, start, end); - - return 0; + return do_kvm_unmap_gfn(kvm, range); } -static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) +static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) +static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* The page will get remapped properly on its next fault */ - do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); + return do_kvm_unmap_gfn(kvm, range); } /*****************************************/ @@ -2079,10 +2057,10 @@ static struct kvmppc_ops kvm_ops_pr = { .flush_memslot = kvmppc_core_flush_memslot_pr, .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, .commit_memory_region = kvmppc_core_commit_memory_region_pr, - .unmap_hva_range = kvm_unmap_hva_range_pr, - .age_hva = kvm_age_hva_pr, - .test_age_hva = kvm_test_age_hva_pr, - .set_spte_hva = kvm_set_spte_hva_pr, + .unmap_gfn_range = kvm_unmap_gfn_range_pr, + .age_gfn = kvm_age_gfn_pr, + .test_age_gfn = kvm_test_age_gfn_pr, + .set_spte_gfn = kvm_set_spte_gfn_pr, .free_memslot = kvmppc_core_free_memslot_pr, .init_vm = kvmppc_core_init_vm_pr, .destroy_vm = kvmppc_core_destroy_vm_pr, diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index ed0c9c43d0cf14cc8a4fd57d72cd74b36ff644ed..7f16afc331efdb5d01f3883ae76e05209d1fc15a 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -721,45 +721,36 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, /************* MMU Notifiers *************/ -static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - trace_kvm_unmap_hva(hva); - /* * Flush all shadow tlb entries everywhere. This is slow, but * we are 100% sure that we catch the to be unmapped page */ - kvm_flush_remote_tlbs(kvm); - - return 0; + return true; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - /* kvm_unmap_hva flushes everything anyways */ - kvm_unmap_hva(kvm, start); - - return 0; + return kvm_e500_mmu_unmap_gfn(kvm, range); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* The page will get remapped properly on its next fault */ - kvm_unmap_hva(kvm, hva); - return 0; + return kvm_e500_mmu_unmap_gfn(kvm, range); } /*****************************************/ diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h index 3837842986aa46ee4ac80f4759d1051d9221c87c..eff6e82dbcd45443fde9ae21fa28380e0df01ba7 100644 --- a/arch/powerpc/kvm/trace_booke.h +++ b/arch/powerpc/kvm/trace_booke.h @@ -69,21 +69,6 @@ TRACE_EVENT(kvm_exit, ) ); -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("unmap hva 0x%lx\n", __entry->hva) -); - TRACE_EVENT(kvm_booke206_stlb_write, TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index dc2666b4180bee28c891498e5d75d6293d7492c7..5c68905d3614880ea5684baaf14f33a39d938ac4 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -20,7 +20,6 @@ #define KVM_MAX_VCPUS (1U << 9) #endif -#define KVM_USER_MEM_SLOTS 512 #define KVM_HALT_POLL_NS_DEFAULT 500000 #define KVM_VCPU_MAX_FEATURES 0 @@ -219,11 +218,7 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end, unsigned int flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa, unsigned long vmid); void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 56fda9ef70fdc5a8c8938c566d2623edfffb323b..311b50b36f67d8597c5b445401cf6dc5b1ee02e5 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -412,38 +412,6 @@ int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, } -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, - gpa_t gpa, u64 size, - void *data), - void *data) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gpa; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; - ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); - } - - return ret; -} - void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, @@ -597,94 +565,71 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return ret; } -static int kvm_unmap_hva_handler(struct kvm *kvm, - gpa_t gpa, u64 size, void *data) -{ - unsigned int flags = *(unsigned int *)data; - bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE; - - stage2_unmap_range(kvm, gpa, size, may_block); - return 0; -} - -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end, unsigned int flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.pgd) return 0; - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, - gpa_t gpa, u64 size, void *data) -{ - pte_t *pte = (pte_t *)data; - - WARN_ON(size != PAGE_SIZE); - stage2_set_pte(kvm, 0, NULL, gpa, pte); - + stage2_unmap_range(kvm, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); return 0; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - unsigned long end = hva + PAGE_SIZE; - kvm_pfn_t pfn = pte_pfn(pte); - pte_t stage2_pte; + int ret; + kvm_pfn_t pfn = pte_pfn(range->pte); if (!kvm->arch.pgd) return 0; - stage2_pte = pfn_pte(pfn, PAGE_WRITE_EXEC); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); + WARN_ON(range->end - range->start != 1); + + ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT, + __pfn_to_phys(pfn), PAGE_SIZE, true, true); + if (ret) { + kvm_debug("Failed to map stage2 page (error %d)\n", ret); + return 1; + } return 0; } -static int kvm_age_hva_handler(struct kvm *kvm, - gpa_t gpa, u64 size, void *data) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { pte_t *ptep; u32 ptep_level = 0; + u64 size = (range->end - range->start) << PAGE_SHIFT; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); - - if (!stage2_get_leaf_entry(kvm, gpa, &ptep, &ptep_level)) + if (!kvm->arch.pgd) return 0; - return ptep_test_and_clear_young(NULL, 0, ptep); -} + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -{ - if (!kvm->arch.pgd) + if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) return 0; - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); + return ptep_test_and_clear_young(NULL, 0, ptep); } -static int kvm_test_age_hva_handler(struct kvm *kvm, - gpa_t gpa, u64 size, void *data) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { pte_t *ptep; u32 ptep_level = 0; + u64 size = (range->end - range->start) << PAGE_SHIFT; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - if (!stage2_get_leaf_entry(kvm, gpa, &ptep, &ptep_level)) + if (!kvm->arch.pgd) return 0; - return pte_young(*ptep); -} + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - if (!kvm->arch.pgd) + if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) return 0; - return handle_hva_to_gpa(kvm, hva, hva, - kvm_test_age_hva_handler, NULL); + return pte_young(*ptep); } int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 171913b9a9250905cf4246fe762615c6bcb57606..4ca1252789e8edb17555f1a15eecb2e2bf1f3123 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -28,7 +28,6 @@ #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 #define KVM_MAX_VCPUS 255 -#define KVM_USER_MEM_SLOTS 32 /* * These seem to be used for allocating ->chip in the routing table, which we diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index 7fc82a233f4957c97bf8f0913a43bacd04a7eefc..3a9a0b0c74654dc4e03361046c6a4f653b87c7ab 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include #endif /* !(__ASSEMBLY__) */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3dda2d99e9d8b653b0662500aa262df9b30242bf..678955cd217506efddc482185bdcb17d0c3e7ef6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,10 +40,8 @@ #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_MAX_VCPU_ID 1023 -#define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) #define KVM_HALT_POLL_NS_DEFAULT 200000 @@ -355,6 +353,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +#define KVM_HAVE_MMU_RWLOCK + struct kvm_mmu_page; /* @@ -660,7 +660,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - unsigned long cr3_lm_rsvd_bits; + u64 reserved_gpa_bits; int maxphyaddr; /* emulate context */ @@ -938,6 +938,13 @@ struct kvm_arch { struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; + /* + * Protects marking pages unsync during page faults, as TDP MMU page + * faults only take mmu_lock for read. For simplicity, the unsync + * pages lock is always taken when marking pages unsync regardless of + * whether mmu_lock is held for read or write. + */ + spinlock_t mmu_unsync_pages_lock; struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; @@ -1029,6 +1036,7 @@ struct kvm_arch { struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; +#ifdef CONFIG_X86_64 /* * Whether the TDP MMU is enabled for this VM. This contains a * snapshot of the TDP MMU module parameter from when the VM was @@ -1038,16 +1046,60 @@ struct kvm_arch { */ bool tdp_mmu_enabled; - /* List of struct tdp_mmu_pages being used as roots */ + /* + * List of struct kvm_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set. + * + * For reads, this list is protected by: + * the MMU lock in read mode + RCU or + * the MMU lock in write mode + * + * For writes, this list is protected by: + * the MMU lock in read mode + the tdp_mmu_pages_lock or + * the MMU lock in write mode + * + * Roots will remain in the list until their tdp_mmu_root_count + * drops to zero, at which point the thread that decremented the + * count to zero should removed the root from the list and clean + * it up, freeing the root after an RCU grace period. + */ struct list_head tdp_mmu_roots; - /* List of struct tdp_mmu_pages not being used as roots */ + + /* + * List of struct kvmp_mmu_pages not being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set and a tdp_mmu_root_count of 0. + */ struct list_head tdp_mmu_pages; + + /* + * Protects accesses to the following fields when the MMU lock + * is held in read mode: + * - tdp_mmu_roots (above) + * - tdp_mmu_pages (above) + * - the link field of struct kvm_mmu_pages used by the TDP MMU + * - lpage_disallowed_mmu_pages + * - the lpage_disallowed_link field of struct kvm_mmu_pages used + * by the TDP MMU + * It is acceptable, but not necessary, to acquire this lock when + * the thread holds the MMU lock in write mode. + */ + spinlock_t tdp_mmu_pages_lock; +#endif /* CONFIG_X86_64 */ + /* * VM-scope maximum vCPU ID. Used to determine the size of structures * that increase along with the maximum vCPU ID, in which case, using * the global KVM_MAX_VCPU_ID may lead to significant memory waste. */ u32 max_vcpu_ids; + + /* + * If set, rmaps have been allocated for all memslots and should be + * allocated for any newly created or modified memslots. + */ + bool memslots_have_rmaps; }; struct kvm_vm_stat { @@ -1403,9 +1455,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1713,11 +1762,8 @@ asmlinkage void kvm_spurious_fault(void); _ASM_EXTABLE(666b, 667b) #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); @@ -1808,4 +1854,5 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define GET_SMSTATE(type, buf, offset) \ (*(type *)((buf) + (offset) - 0x7e00)) +int alloc_all_memslots_rmaps(struct kvm *kvm); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 1c6c9adcb730f64d0914162f45bc545cbe8e2cfc..3fc65dd3a33b678fcc6648cdab62c0acea6a295e 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o + mmu/spte.o +kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6c69733d2c98aae47c0182a8967d56282c27a469..2f67d137ff2fe2e7cf820b5704869586c11bf983 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -216,16 +216,20 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - kvm_mmu_reset_context(vcpu); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); - /* Invoke the vendor callback only after the above state is updated. */ kvm_x86_ops.vcpu_after_set_cpuid(vcpu); + + /* + * Except for the MMU, which needs to be reset after any vendor + * specific adjustments to the reserved GPA bits. + */ + kvm_mmu_reset_context(vcpu); } static int is_efer_nx(void) @@ -266,6 +270,16 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) return 36; } +/* + * This "raw" version returns the reserved GPA bits without any adjustments for + * encryption technologies that usurp bits. The raw mask should be used if and + * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs. + */ +u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); +} + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a7126dd9470d48d8b4e96078db669ebc2759b866..7ba00522c51cf1b342709a4e010fcf22b1ac1b01 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -70,15 +70,32 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); +u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { return vcpu->arch.maxphyaddr; } +static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return !(gpa & vcpu->arch.reserved_gpa_bits); +} + static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); + return !kvm_vcpu_is_legal_gpa(vcpu, gpa); +} + +static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t alignment) +{ + return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa); +} + +static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE); } /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ @@ -402,11 +419,6 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) kvm_cpu_cap_set(x86_feature); } -static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, unsigned int kvm_feature) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 581925e476d6c57ccc7af45f61761b55e8686c8e..14d9fdff1fb0235732662a4918dd111c4ed778bb 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -52,7 +52,8 @@ static inline u64 rsvd_bits(int s, int e) return ((2ULL << (e - s)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); @@ -117,7 +118,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * write-protects guest page to sync the guest modification, b) another one is * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences * between these two sorts are: - * 1) the first case clears SPTE_MMU_WRITEABLE bit. + * 1) the first case clears MMU-writable bit. * 2) the first case requires flushing tlb immediately avoiding corrupting * shadow page table between all vcpus so it should be in the protection of * mmu-lock. And the another case does not need to flush tlb until returning @@ -128,24 +129,24 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * So, there is the problem: the first case can meet the corrupted tlb caused * by another case which write-protects pages but without flush tlb * immediately. In order to making the first case be aware this problem we let - * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit - * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit. + * it flush tlb if we try to write-protect a spte whose MMU-writable bit + * is set, it works since another case never touches MMU-writable bit. * * Anyway, whenever a spte is updated (only permission and status bits are - * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes + * changed) we need to check whether the spte with MMU-writable becomes * readonly, if that happens, we need to flush tlb. Fortunately, * mmu_spte_update() has already handled it perfectly. * - * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK: + * The rules to use MMU-writable and PT_WRITABLE_MASK: * - if we want to see if it has writable tlb entry or if the spte can be - * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most + * writable on the mmu mapping, check MMU-writable, this is the most * case, otherwise * - if we fix page fault on the spte or do write-protection by dirty logging, * check PT_WRITABLE_MASK. * * TODO: introduce APIs to split these two cases. */ -static inline int is_writable_pte(unsigned long pte) +static inline bool is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -220,4 +221,14 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); +static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +{ + /* + * Read memslot_have_rmaps before rmap pointers. Hence, threads reading + * memslots_have_rmaps in any lock context are guaranteed to see the + * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps. + */ + return smp_load_acquire(&kvm->arch.memslots_have_rmaps); +} + #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 89ad6239138e65b7955660dc316b801a6e9e91ec..891dc122b69d0bdc552f78090500ac141105680d 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -56,7 +56,7 @@ extern bool itlb_multihit_kvm_mitigation; -static int __read_mostly nx_huge_pages = -1; +int __read_mostly nx_huge_pages = -1; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; @@ -210,18 +210,13 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { - u64 mask = make_mmio_spte(vcpu, gfn, access); + u64 spte = make_mmio_spte(vcpu, gfn, access); - trace_mark_mmio_spte(sptep, gfn, mask); - mmu_spte_set(sptep, mask); + trace_mark_mmio_spte(sptep, gfn, spte); + mmu_spte_set(sptep, spte); } static gfn_t get_mmio_spte_gfn(u64 spte) @@ -239,17 +234,6 @@ static unsigned get_mmio_spte_access(u64 spte) return spte & shadow_mmio_access_mask; } -static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - kvm_pfn_t pfn, unsigned int access) -{ - if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(vcpu, sptep, gfn, access); - return true; - } - - return false; -} - static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { u64 kvm_gen, spte_gen, gen; @@ -630,28 +614,36 @@ static bool mmu_spte_age(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - /* - * Prevent page table teardown by making any free-er wait during - * kvm_flush_remote_tlbs() IPI to all active vcpus. - */ - local_irq_disable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_begin(); + } else { + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); - /* - * Make sure a following spte read is not reordered ahead of the write - * to vcpu->mode. - */ - smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* - * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us - * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. - */ - smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); - local_irq_enable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_end(); + } else { + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) @@ -722,8 +714,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) * handling slots that are not large page aligned. */ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) + const struct kvm_memory_slot *slot, int level) { unsigned long idx; @@ -1115,7 +1106,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); if (pt_protect) - spte &= ~SPTE_MMU_WRITEABLE; + spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; return mmu_spte_update(sptep, spte); @@ -1162,7 +1153,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared. */ -static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1193,7 +1185,8 @@ static bool spte_set_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1222,9 +1215,13 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1251,13 +1248,17 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); - __rmap_clear_dirty(kvm, rmap_head); + __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ mask &= mask - 1; @@ -1293,12 +1294,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, int i; bool write_protected = false; - for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); + if (kvm_memslots_have_rmaps(kvm)) { + for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); + } } - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); @@ -1313,7 +1316,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1329,26 +1333,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) return flush; } -static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { - return kvm_zap_rmapp(kvm, rmap_head); + return kvm_zap_rmapp(kvm, rmap_head, slot); } -static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t pte) { u64 *sptep; struct rmap_iterator iter; int need_flush = 0; u64 new_spte; - pte_t *ptep = (pte_t *)data; kvm_pfn_t new_pfn; - WARN_ON(pte_huge(*ptep)); - new_pfn = pte_pfn(*ptep); + WARN_ON(pte_huge(pte)); + new_pfn = pte_pfn(pte); restart: for_each_rmap_spte(rmap_head, &iter, sptep) { @@ -1357,7 +1360,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, need_flush = 1; - if (pte_write(*ptep)) { + if (pte_write(pte)) { pte_list_remove(rmap_head, sptep); goto restart; } else { @@ -1445,92 +1448,54 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, - int level, - unsigned long data)) +typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t pte); + +static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + rmap_handler_t handler) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; struct slot_rmap_walk_iterator iterator; - int ret = 0; - int i; + bool ret = false; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for_each_slot_rmap_range(memslot, PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - gfn_start, gfn_end - 1, - &iterator) - ret |= handler(kvm, iterator.rmap, memslot, - iterator.gfn, iterator.level, data); - } - } + for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + range->start, range->end - 1, &iterator) + ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, + iterator.level, range->pte); return ret; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, int level, - unsigned long data)) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); -} - -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) -{ - int r; + bool flush = false; - r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); - if (kvm->arch.tdp_mmu_enabled) - r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + if (is_tdp_mmu_enabled(kvm)) + flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); - return r; + return flush; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int r; + bool flush = false; - r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); - if (kvm->arch.tdp_mmu_enabled) - r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + if (is_tdp_mmu_enabled(kvm)) + flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); - return r; + return flush; } -static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1539,13 +1504,12 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, for_each_rmap_spte(rmap_head, &iter, sptep) young |= mmu_spte_age(sptep); - trace_kvm_age_page(gfn, level, slot, young); return young; } -static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, unsigned long data) +static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1567,29 +1531,33 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int young = false; + bool young = false; - young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); - if (kvm->arch.tdp_mmu_enabled) - young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + + if (is_tdp_mmu_enabled(kvm)) + young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int young = false; + bool young = false; + + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); - young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); - if (kvm->arch.tdp_mmu_enabled) - young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + if (is_tdp_mmu_enabled(kvm)) + young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; } @@ -1998,9 +1966,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } - if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); - cond_resched_lock(&vcpu->kvm->mmu_lock); + cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } @@ -2441,6 +2409,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); + /* + * Note, this check is intentionally soft, it only guarantees that one + * page is available, while the caller may end up allocating as many as + * four pages, e.g. for PAE roots or for 5-level paging. Temporarily + * exceeding the (arbitrary by default) limit will not harm the host, + * being too agressive may unnecessarily kill the guest, and getting an + * exact count is far more trouble than it's worth, especially in the + * page fault paths. + */ if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; return 0; @@ -2452,7 +2429,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - @@ -2463,7 +2440,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -2474,7 +2451,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); @@ -2482,7 +2459,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return r; } @@ -2501,6 +2478,7 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + bool locked = false; if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) return true; @@ -2512,9 +2490,34 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, if (sp->unsync) continue; + /* + * TDP MMU page faults require an additional spinlock as they + * run with mmu_lock held for read, not write, and the unsync + * logic is not thread safe. Take the spinklock regardless of + * the MMU type to avoid extra conditionals/parameters, there's + * no meaningful penalty if mmu_lock is held for write. + */ + if (!locked) { + locked = true; + spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + + /* + * Recheck after taking the spinlock, a different vCPU + * may have since marked the page unsync. A false + * positive on the unprotected check above is not + * possible as clearing sp->unsync _must_ hold mmu_lock + * for write, i.e. unsync cannot transition from 0->1 + * while this CPU holds mmu_lock for read (or write). + */ + if (READ_ONCE(sp->unsync)) + continue; + } + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } + if (locked) + spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible @@ -2567,9 +2570,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp; int ret; - if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) - return 0; - sp = sptep_to_sp(sptep); ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, @@ -2599,6 +2599,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); + if (unlikely(is_noslot_pfn(pfn))) { + mark_mmio_spte(vcpu, sptep, gfn, pte_access); + return RET_PF_EMULATE; + } + if (is_shadow_present_pte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink @@ -2632,9 +2637,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - if (unlikely(is_mmio_spte(*sptep))) - ret = RET_PF_EMULATE; - /* * The fault is fully spurious if and only if the new SPTE and old SPTE * are identical, and emulation is not required. @@ -2740,11 +2742,18 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. + */ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return; + __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, - kvm_pfn_t pfn, struct kvm_memory_slot *slot) +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + const struct kvm_memory_slot *slot) { unsigned long hva; pte_t *pte; @@ -2763,19 +2772,37 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, */ hva = __gfn_to_hva_memslot(slot, gfn); - pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); + pte = lookup_address_in_mm(kvm->mm, hva, &level); if (unlikely(!pte)) return PG_LEVEL_4K; return level; } +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t pfn, int max_level) +{ + struct kvm_lpage_info *linfo; + + max_level = min(max_level, max_huge_page_level); + for ( ; max_level > PG_LEVEL_4K; max_level--) { + linfo = lpage_info_slot(gfn, slot, max_level); + if (!linfo->disallow_lpage) + break; + } + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + return host_pfn_mapping_level(kvm, gfn, pfn, slot); +} + int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; kvm_pfn_t pfn = *pfnp; kvm_pfn_t mask; int level; @@ -2792,17 +2819,7 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_huge_page_level); - for ( ; max_level > PG_LEVEL_4K; max_level--) { - linfo = lpage_info_slot(gfn, slot, max_level); - if (!linfo->disallow_lpage) - break; - } - - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); + level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); if (level == PG_LEVEL_4K) return level; @@ -2862,9 +2879,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); @@ -2938,9 +2952,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, return true; } - if (unlikely(is_noslot_pfn(pfn))) + if (unlikely(is_noslot_pfn(pfn))) { vcpu_cache_mmio_info(vcpu, gva, gfn, access & shadow_mmio_access_mask); + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. + */ + if (unlikely(!shadow_mmio_value)) { + *ret_val = RET_PF_EMULATE; + return true; + } + } return false; } @@ -3030,15 +3054,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between walk_shadow_page_lockless_{begin,end}. + * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; + u64 old_spte; + u64 *sptep = NULL; + + for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { + sptep = iterator.sptep; + *spte = old_spte; + + if (!is_shadow_present_pte(old_spte)) + break; + } + + return sptep; +} + +/* + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + */ +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +{ struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte = 0ull; + u64 *sptep = NULL; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) @@ -3049,11 +3098,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; + if (is_tdp_mmu(vcpu->arch.mmu)) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + else + sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); - sp = sptep_to_sp(iterator.sptep); + if (!is_shadow_present_pte(spte)) + break; + + sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3111,8 +3164,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3125,8 +3177,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); - trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3144,12 +3195,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (WARN_ON(!sp)) return; - if (kvm_mmu_put_root(kvm, sp)) { - if (sp->tdp_mmu_page) - kvm_tdp_mmu_free_root(kvm, sp); - else if (sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); - } + if (is_tdp_mmu_page(sp)) + kvm_tdp_mmu_put_root(kvm, sp, false); + else if (!--sp->root_count && sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); *root_hpa = INVALID_PAGE; } @@ -3176,7 +3225,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) @@ -3199,7 +3248,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3220,192 +3269,224 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, { struct kvm_mmu_page *sp; - spin_lock(&vcpu->kvm->mmu_lock); - - if (make_mmu_pages_available(vcpu)) { - spin_unlock(&vcpu->kvm->mmu_lock); - return INVALID_PAGE; - } sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); return __pa(sp->spt); } static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { - u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u8 shadow_root_level = mmu->shadow_root_level; hpa_t root; unsigned i; + int r; - if (vcpu->kvm->arch.tdp_mmu_enabled) { - root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); + write_lock(&vcpu->kvm->mmu_lock); + r = make_mmu_pages_available(vcpu); + if (r < 0) + goto out_unlock; - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + if (is_tdp_mmu_enabled(vcpu->kvm)) { + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); + mmu->root_hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, - true); - - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + mmu->root_hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { + if (WARN_ON_ONCE(!mmu->pae_root)) { + r = -EIO; + goto out_unlock; + } + for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); + WARN_ON_ONCE(mmu->pae_root[i] && + VALID_PAGE(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, true); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; + mmu->pae_root[i] = root | PT_PRESENT_MASK | + shadow_me_mask; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); - } else - BUG(); + mmu->root_hpa = __pa(mmu->pae_root); + } else { + WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); + r = -EIO; + goto out_unlock; + } /* root_pgd is ignored for direct MMUs. */ - vcpu->arch.mmu->root_pgd = 0; - - return 0; + mmu->root_pgd = 0; +out_unlock: + write_unlock(&vcpu->kvm->mmu_lock); + return r; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { - u64 pdptr, pm_mask; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; hpa_t root; - int i; + unsigned i; + int r; - root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); + root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; + /* + * On SVM, reading PDPTRs might access guest memory, which might fault + * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. + */ + if (mmu->root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + pdptrs[i] = mmu->get_pdptr(vcpu, i); + if (!(pdptrs[i] & PT_PRESENT_MASK)) + continue; + + if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) + return 1; + } + } + + r = alloc_all_memslots_rmaps(vcpu->kvm); + if (r) + return r; + + write_lock(&vcpu->kvm->mmu_lock); + r = make_mmu_pages_available(vcpu); + if (r < 0) + goto out_unlock; + /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa)); - + if (mmu->root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - vcpu->arch.mmu->shadow_root_level, false); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + mmu->shadow_root_level, false); + mmu->root_hpa = root; goto set_root_pgd; } + if (WARN_ON_ONCE(!mmu->pae_root)) { + r = -EIO; + goto out_unlock; + } + /* * We shadow a 32 bit page table. This may be a legacy 2-level * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table. */ - pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { + pm_mask = PT_PRESENT_MASK | shadow_me_mask; + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - /* - * Allocate the page for the PDPTEs when shadowing 32-bit NPT - * with 64-bit only when needed. Unlike 32-bit NPT, it doesn't - * need to be in low mem. See also pml4_root below. - */ - if (!vcpu->arch.mmu->pae_root) { - WARN_ON_ONCE(!tdp_enabled); - - vcpu->arch.mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mmu->pae_root) - return -ENOMEM; + if (WARN_ON_ONCE(!mmu->pml4_root)) { + r = -EIO; + goto out_unlock; } + + mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; + + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; } for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); - if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { - pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); - if (!(pdptr & PT_PRESENT_MASK)) { - vcpu->arch.mmu->pae_root[i] = 0; + WARN_ON_ONCE(mmu->pae_root[i] && VALID_PAGE(mmu->pae_root[i])); + + if (mmu->root_level == PT32E_ROOT_LEVEL) { + if (!(pdptrs[i] & PT_PRESENT_MASK)) { + mmu->pae_root[i] = 0; continue; } - root_gfn = pdptr >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; + root_gfn = pdptrs[i] >> PAGE_SHIFT; } root = mmu_alloc_root(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, false); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->pae_root[i] = root | pm_mask; + mmu->pae_root[i] = root | pm_mask; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); - /* - * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP - * tables are allocated and initialized at MMU creation as there is no - * equivalent level in the guest's NPT to shadow. Allocate the tables - * on demand, as running a 32-bit L1 VMM is very rare. The PDP is - * handled above (to share logic with PAE), deal with the PML4 here. - */ - if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { - if (vcpu->arch.mmu->pml4_root == NULL) { - u64 *pml4_root; + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->root_hpa = __pa(mmu->pml5_root); + else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + mmu->root_hpa = __pa(mmu->pml4_root); + else + mmu->root_hpa = __pa(mmu->pae_root); - pml4_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) - return -ENOMEM; +set_root_pgd: + mmu->root_pgd = root_pgd; +out_unlock: + write_unlock(&vcpu->kvm->mmu_lock); - pml4_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; + return r; +} - vcpu->arch.mmu->pml4_root = pml4_root; - } +static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 *pml5_root = NULL; + u64 *pml4_root = NULL; + u64 *pae_root; - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pml4_root); - } -#ifdef CONFIG_X86_64 - if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_5LEVEL) { - if (vcpu->arch.mmu->pml4_root == NULL) { - u64 *pml4_root; - pml4_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) - return -ENOMEM; + /* + * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP + * tables are allocated and initialized at root creation as there is no + * equivalent level in the guest's NPT to shadow. Allocate the tables + * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. + */ + if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL || + mmu->shadow_root_level < PT64_ROOT_4LEVEL) + return 0; - pml4_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; + if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root) + return 0; - vcpu->arch.mmu->pml4_root = pml4_root; - } - if (vcpu->arch.mmu->pml5_root == NULL) { - u64 *pml5_root; + /* + * The special roots should always be allocated in concert. Yell and + * bail if KVM ends up in a state where only one of the roots is valid. + */ + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || + mmu->pml5_root)) + return -EIO; - pml5_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml5_root) - return -ENOMEM; + /* Unlike 32-bit NPT, the PDP table doesn't need to be in low mem. */ + pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pae_root) + return -ENOMEM; - pml5_root[0] = __pa(vcpu->arch.mmu->pml4_root) | pm_mask; +#ifdef CONFIG_X86_64 + if (!pml4_root) + goto err_pml4; - vcpu->arch.mmu->pml5_root = pml5_root; - } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pml5_root); + if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) { + pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml5_root) + goto err_pml5; } #endif -set_root_pgd: - vcpu->arch.mmu->root_pgd = root_pgd; + mmu->pae_root = pae_root; + mmu->pml4_root = pml4_root; + mmu->pml5_root = pml5_root; return 0; -} -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mmu->direct_map) - return mmu_alloc_direct_roots(vcpu); - else - return mmu_alloc_shadow_roots(vcpu); +#ifdef CONFIG_X86_64 +err_pml5: + free_page((unsigned long)pml4_root); +err_pml4: + free_page((unsigned long)pae_root); + return -ENOMEM; +#endif } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -3439,17 +3520,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) !smp_load_acquire(&sp->unsync_children)) return; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); mmu_sync_children(vcpu, sp); kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return; } - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { @@ -3463,7 +3544,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); @@ -3515,6 +3596,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { @@ -3522,8 +3605,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); @@ -3531,65 +3612,63 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); - sptes[leaf - 1] = spte; + sptes[leaf] = spte; if (!is_shadow_present_pte(spte)) break; } - walk_shadow_page_lockless_end(vcpu); - return leaf; } -/* return true if reserved bit is detected on spte. */ +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { - u64 sptes[PT64_ROOT_MAX_LEVEL]; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; struct rsvd_bits_validate *rsvd_check; int root, leaf, level; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { - *sptep = 0ull; - return reserved; - } + walk_shadow_page_lockless_begin(vcpu); - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); + walk_shadow_page_lockless_end(vcpu); + if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; } + *sptep = sptes[leaf]; + + /* + * Skip reserved bits checks on the terminal leaf if it's not a valid + * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by + * design, always have reserved bits set. The purpose of the checks is + * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. + */ + if (!is_shadow_present_pte(sptes[leaf])) + leaf++; + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; - for (level = root; level >= leaf; level--) { - if (!is_shadow_present_pte(sptes[level - 1])) - break; - /* - * Use a bitwise-OR instead of a logical-OR to aggregate the - * reserved bit and EPT's invalid memtype/XWR checks to avoid - * adding a Jcc in the loop. - */ - reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) || - __is_rsvd_bits_set(rsvd_check, sptes[level - 1], - level); - } + for (level = root; level >= leaf; level--) + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) || + __is_rsvd_bits_set(rsvd_check, sptes[level], level); if (reserved) { - pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) - pr_err("------ spte 0x%llx level %d.\n", - sptes[level - 1], level); + pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", + sptes[level], level, + rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]); } - *sptep = sptes[leaf - 1]; - return reserved; } @@ -3687,8 +3766,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable) + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, + bool write, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3709,7 +3788,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, + write, writable, hva); if (!async) return false; /* *pfn has correct page already */ @@ -3723,29 +3803,30 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, + write, writable, hva); return false; } static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { + bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); bool write = error_code & PFERR_WRITE_MASK; bool map_writable; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; kvm_pfn_t pfn; + hva_t hva; int r; if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; - } + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) @@ -3754,21 +3835,27 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + + if (is_tdp_mmu_fault) + read_lock(&vcpu->kvm->mmu_lock); + else + write_lock(&vcpu->kvm->mmu_lock); + + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, pfn, prefault); else @@ -3776,7 +3863,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + if (is_tdp_mmu_fault) + read_unlock(&vcpu->kvm->mmu_lock); + else + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } @@ -4020,20 +4110,27 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, static void __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct rsvd_bits_validate *rsvd_check, - int maxphyaddr, int level, bool nx, bool gbpages, + u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { - u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; + u64 high_bits_rsvd; rsvd_check->bad_mt_xwr = 0; - if (!nx) - exb_bit_rsvd = rsvd_bits(63, 63); if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); + if (level == PT32E_ROOT_LEVEL) + high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); + else + high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); + + /* Note, NX doesn't exist in PDPTEs, this is handled below. */ + if (!nx) + high_bits_rsvd |= rsvd_bits(63, 63); + /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. @@ -4062,45 +4159,39 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - rsvd_check->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 63) | - rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PDE */ - rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PTE */ - rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62) | - rsvd_bits(13, 20); /* large page */ + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | + high_bits_rsvd | + rsvd_bits(5, 8) | + rsvd_bits(1, 2); /* PDPTE */ + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_5LEVEL: - rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | - rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | + nonleaf_bit8_rsvd | + rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; fallthrough; case PT64_ROOT_4LEVEL: - rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - gbpages_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | + nonleaf_bit8_rsvd | + rsvd_bits(7, 7); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | + gbpages_bit_rsvd; + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | - gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 29); - rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 20); /* large page */ + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | + gbpages_bit_rsvd | + rsvd_bits(13, 29); + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; @@ -4111,8 +4202,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, - cpuid_maxphyaddr(vcpu), context->root_level, - context->nx, + vcpu->arch.reserved_gpa_bits, + context->root_level, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), guest_cpuid_is_amd_or_hygon(vcpu)); @@ -4120,27 +4211,22 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - int maxphyaddr, bool execonly) + u64 pa_bits_rsvd, bool execonly) { + u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 bad_mt_xwr; - rsvd_check->rsvd_bits_mask[0][4] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][3] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][1] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); + rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - rsvd_check->rsvd_bits_mask[1][1] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29); + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ @@ -4159,7 +4245,12 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, - cpuid_maxphyaddr(vcpu), execonly); + vcpu->arch.reserved_gpa_bits, execonly); +} + +static inline u64 reserved_hpa_bits(void) +{ + return rsvd_bits(shadow_phys_bits, 63); } /* @@ -4189,7 +4280,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - shadow_phys_bits, + reserved_hpa_bits(), context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4226,14 +4317,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - shadow_phys_bits, + reserved_hpa_bits(), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - shadow_phys_bits, - false); + reserved_hpa_bits(), false); if (!shadow_me_mask) return; @@ -4253,7 +4343,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - shadow_phys_bits, execonly); + reserved_hpa_bits(), execonly); } #define BYTE_MASK(access) \ @@ -4880,10 +4970,18 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); if (r) goto out; - r = mmu_alloc_roots(vcpu); - kvm_mmu_sync_roots(vcpu); + r = mmu_alloc_special_roots(vcpu); + if (r) + goto out; + if (vcpu->arch.mmu->direct_map) + r = mmu_alloc_direct_roots(vcpu); + else + r = mmu_alloc_shadow_roots(vcpu); if (r) goto out; + + kvm_mmu_sync_roots(vcpu); + kvm_mmu_load_pgd(vcpu); kvm_x86_ops.tlb_flush_current(vcpu); out: @@ -5044,7 +5142,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ mmu_topup_memory_caches(vcpu, true); - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5076,7 +5174,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) @@ -5261,51 +5359,46 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. */ static __always_inline bool slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) + gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, + bool flush) { struct slot_rmap_walk_iterator iterator; - bool flush = false; for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) - flush |= fn(kvm, iterator.rmap); + flush |= fn(kvm, iterator.rmap, memslot); - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush && lock_flush_tlb) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + if (flush && flush_on_yield) { kvm_flush_remote_tlbs_with_address(kvm, start_gfn, iterator.gfn - start_gfn + 1); flush = false; } - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); } } - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs_with_address(kvm, start_gfn, - end_gfn - start_gfn + 1); - flush = false; - } - return flush; } static __always_inline bool slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, - bool lock_flush_tlb) + bool flush_on_yield) { return slot_handle_level_range(kvm, memslot, fn, start_level, end_level, memslot->base_gfn, memslot->base_gfn + memslot->npages - 1, - lock_flush_tlb); + flush_on_yield, false); } static __always_inline bool @@ -5326,10 +5419,10 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) + slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, - PG_LEVEL_4K, lock_flush_tlb); + PG_LEVEL_4K, flush_on_yield); } static void free_mmu_pages(struct kvm_mmu *mmu) @@ -5440,7 +5533,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) * be in active use by the guest. */ if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + cond_resched_rwlock_write(&kvm->mmu_lock)) { batch = 0; goto restart; } @@ -5474,7 +5567,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); trace_kvm_mmu_zap_all_fast(kvm); /* @@ -5486,6 +5579,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) */ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; + /* In order to ensure all threads see this change when + * handling the MMU reload signal, this must happen in the + * same critical section as kvm_reload_remote_mmus, and + * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages + * could drop the MMU lock and yield. + */ + if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_invalidate_all_roots(kvm); + /* * Notify all vcpus to reload its shadow page table and flush TLB. * Then all vcpus will switch to new shadow page table with the new @@ -5498,10 +5600,21 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_zap_obsolete_pages(kvm); - if (kvm->arch.tdp_mmu_enabled) - kvm_tdp_mmu_zap_all(kvm); + write_unlock(&kvm->mmu_lock); - spin_unlock(&kvm->mmu_lock); + /* + * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before + * returning to the caller, e.g. if the zap is in response to a memslot + * deletion, mmu_notifier callbacks will be unable to reach the SPTEs + * associated with the deleted memslot once the update completes, and + * Deferring the zap until the final reference to the root is put would + * lead to use-after-free. + */ + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_zap_invalidated_roots(kvm); + read_unlock(&kvm->mmu_lock); + } } static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) @@ -5520,7 +5633,15 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - kvm_mmu_init_tdp_mmu(kvm); + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); + + if (!kvm_mmu_init_tdp_mmu(kvm)) + /* + * No smp_load/store wrappers needed here as we are in + * VM init and there cannot be any memslots / other threads + * accessing this struct kvm yet. + */ + kvm->arch.memslots_have_rmaps = true; node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -5541,37 +5662,49 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; - bool flush; - - spin_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; + bool flush = false; - slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; + + flush = slot_handle_level_range(kvm, memslot, + kvm_zap_rmapp, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, start, + end - 1, true, flush); + } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + write_unlock(&kvm->mmu_lock); } - if (kvm->arch.tdp_mmu_enabled) { - flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); + if (is_tdp_mmu_enabled(kvm)) { + flush = false; + + read_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, + gfn_end, flush, true); if (flush) - kvm_flush_remote_tlbs(kvm); - } + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end); - spin_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock); + } } static bool slot_rmap_write_protect(struct kvm *kvm, - struct kvm_rmap_head *rmap_head) + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } @@ -5580,14 +5713,21 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot, int start_level) { - bool flush; + bool flush = false; - spin_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - if (kvm->arch.tdp_mmu_enabled) - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); - spin_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, + false); + write_unlock(&kvm->mmu_lock); + } + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); + read_unlock(&kvm->mmu_lock); + } /* * We can flush all the TLBs out of the mmu lock without TLB @@ -5596,16 +5736,17 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * spte from present to present (changing the spte from present * to nonpresent will flush all the TLBs immediately), in other * words, the only case we care is mmu_spte_update() where we - * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE - * instead of PT_WRITABLE_MASK, that means it does not depend - * on PT_WRITABLE_MASK anymore. + * have checked Host-writable | MMU-writable instead of + * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK + * anymore. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, - struct kvm_rmap_head *rmap_head) + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -5626,8 +5767,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - (kvm_is_zone_device_pfn(pfn) || - PageCompound(pfn_to_page(pfn)))) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, + pfn, PG_LEVEL_NUM)) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -5647,13 +5788,24 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - spin_lock(&kvm->mmu_lock); - slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, - kvm_mmu_zap_collapsible_spte, true); + struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; + bool flush = false; + + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + write_unlock(&kvm->mmu_lock); + } - if (kvm->arch.tdp_mmu_enabled) - kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + read_unlock(&kvm->mmu_lock); + } } void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, @@ -5661,7 +5813,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, { /* * All current use cases for flushing the TLBs for a specific memslot - * are related to dirty logging, and do the TLB flush out of mmu_lock. + * related to dirty logging, and many do the TLB flush out of mmu_lock. * The interaction between the various operations on memslot must be * serialized by slots_locks to ensure the TLB flush from one operation * is observed by any other operation on the same memslot. @@ -5674,13 +5826,20 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush; + bool flush = false; + + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, + false); + write_unlock(&kvm->mmu_lock); + } - spin_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock); + } /* * It's also safe to flush TLBs out of mmu lock here as currently this @@ -5698,12 +5857,18 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); - flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, - false); - if (kvm->arch.tdp_mmu_enabled) + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, + false); + write_unlock(&kvm->mmu_lock); + } + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); - spin_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock); + } if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5715,11 +5880,17 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); - flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); - if (kvm->arch.tdp_mmu_enabled) + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); + write_unlock(&kvm->mmu_lock); + } + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock); + } if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5732,23 +5903,23 @@ void kvm_mmu_zap_all(struct kvm *kvm) LIST_HEAD(invalid_list); int ign; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; - if (cond_resched_lock(&kvm->mmu_lock)) + if (cond_resched_rwlock_write(&kvm->mmu_lock)) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_all(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) @@ -5808,7 +5979,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) continue; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvm_has_zapped_obsolete_pages(kvm)) { kvm_mmu_commit_zap_page(kvm, @@ -5819,7 +5990,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); /* @@ -5853,25 +6024,6 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - - /* - * Set a reserved PA bit in MMIO SPTEs to generate page faults with - * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT - * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports - * 52-bit physical addresses then there are no reserved PA bits in the - * PTEs and so the reserved PA approach must be disabled. - */ - if (shadow_phys_bits < 52) - mask = BIT_ULL(51) | PT_PRESENT_MASK; - else - mask = 0; - - kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK); -} - static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ @@ -5949,8 +6101,6 @@ int kvm_mmu_vendor_module_init(void) kvm_mmu_reset_all_pte_masks(); - kvm_set_mmio_spte_mask(); - pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); @@ -6044,6 +6194,7 @@ static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel static void kvm_recover_nx_lpages(struct kvm *kvm) { + unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -6052,10 +6203,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) break; @@ -6069,22 +6220,23 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - if (sp->tdp_mmu_page) { + + if (is_tdp_mmu_page(sp)) { flush |= kvm_tdp_mmu_zap_sp(kvm, sp); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); } - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); flush = false; } } kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bfc6389edc28a4c0d0d27ce95cc33276efe04b27..b6a08efad25daed41df1d53918988561411b5540 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -26,7 +26,13 @@ struct kvm_mmu_page { struct list_head lpage_disallowed_link; bool unsync; - u8 mmu_valid_gen; + + union { + u8 mmu_valid_gen; + + /* Only accessed under slots_lock. */ + bool tdp_mmu_scheduled_root_to_zap; + }; bool mmio_cached; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ @@ -40,7 +46,11 @@ struct kvm_mmu_page { u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - int root_count; /* Currently serving as active root */ + /* Currently serving as active root */ + union { + int root_count; + refcount_t tdp_mmu_root_count; + }; unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ DECLARE_BITMAP(unsync_child_bitmap, 512); @@ -56,7 +66,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; +#ifdef CONFIG_X86_64 bool tdp_mmu_page; + + /* Used for freeing the page asyncronously if it is a TDP MMU page. */ + struct rcu_head rcu_head; +#endif }; extern struct kmem_cache *mmu_page_header_cache; @@ -73,18 +88,36 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) +{ + return role.smm ? 1 : 0; +} + +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return kvm_mmu_role_as_id(sp->role); +} + static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) { /* * When using the EPT page-modification log, the GPAs in the log * would come from L2 rather than L1. Therefore, we need to rely * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. + * PML, since writes now result in a vmexit. Note, this helper will + * tag SPTEs as needing write-protection even if PML is disabled or + * unsupported, but that's ok because the tag is consumed if and only + * if PML is enabled. Omit the PML check to save a few uops. */ return vcpu->arch.mmu == &vcpu->arch.guest_mmu; } -bool is_nx_huge_page_enabled(void); +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); @@ -95,22 +128,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); -static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - BUG_ON(!sp->root_count); - lockdep_assert_held(&kvm->mmu_lock); - - ++sp->root_count; -} - -static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - lockdep_assert_held(&kvm->mmu_lock); - --sp->root_count; - - return !sp->root_count; -} - /* * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). * @@ -119,6 +136,9 @@ static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * + * Any names added to this enum should be exported to userspace for use in + * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h */ enum { RET_PF_RETRY = 0, @@ -133,14 +153,15 @@ enum { #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) #define SET_SPTE_SPURIOUS BIT(2) +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t pfn, int max_level); int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level); void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, kvm_pfn_t *pfnp, int *goal_levelp); -bool is_nx_huge_page_enabled(void); - void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 213699b27b44877ae47cb91864eb08087a5c761c..bbae3488f8e014d4fe04bfa28bbbcc2a1da41134 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,12 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_RETRY); +TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_FIXED); +TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); + /* * A pagetable walk has started */ @@ -381,6 +387,35 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + kvm_tdp_mmu_spte_changed, + TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte), + TP_ARGS(as_id, gfn, level, old_spte, new_spte), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, old_spte) + __field(u64, new_spte) + /* Level cannot be larger than 5 on x86, so it fits in a u8. */ + __field(u8, level) + /* as_id can only be 0 or 1 x86, so it fits in a u8. */ + __field(u8, as_id) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->old_spte = old_spte; + __entry->new_spte = new_spte; + __entry->level = level; + __entry->as_id = as_id; + ), + + TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx", + __entry->as_id, __entry->gfn, __entry->level, + __entry->old_spte, __entry->new_spte + ) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 81cf4babbd0b475cddc204267b9ea87503b79e3f..20e62efd6457b09437a68b4bed15413e8764552c 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm, head = &kvm->arch.track_notifier_head; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); @@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, head = &kvm->arch.track_notifier_head; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); hlist_del_rcu(&n->node); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index c6daeeff1d9c9d958e08375c8baaedc295f7069f..20a269bce3594876a019cf84a0d42faa478b61c6 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -502,6 +502,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, #endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; + walker->fault.async_page_fault = false; trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; @@ -600,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. + */ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return; + if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); @@ -790,6 +798,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; + hva_t hva; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; int max_level; @@ -840,8 +849,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) @@ -868,8 +877,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, } r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + write_lock(&vcpu->kvm->mmu_lock); + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); @@ -881,7 +890,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } @@ -919,7 +928,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) return; } - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -954,7 +963,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ @@ -1076,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; - host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; + host_writable = sp->spt[i] & shadow_host_writable_mask; set_spte_ret |= set_spte(vcpu, &sp->spt[i], pte_access, PG_LEVEL_4K, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c51ad544f25b21a3ccb8758fd2503ff6d52bf270..9e0eaf9da41f10dc678833bf041bc2c01ff66c53 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -16,13 +16,17 @@ #include "spte.h" #include +#include +u64 __read_mostly shadow_host_writable_mask; +u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ u64 __read_mostly shadow_user_mask; u64 __read_mostly shadow_accessed_mask; u64 __read_mostly shadow_dirty_mask; u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; u64 __read_mostly shadow_me_mask; @@ -38,7 +42,6 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 mask; WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; @@ -48,16 +51,18 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); + u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; + WARN_ON_ONCE(!shadow_mmio_value); + access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + spte |= shadow_mmio_value | access; + spte |= gpa | shadow_nonpresent_or_rsvd_mask; + spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; - return mask; + return spte; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) @@ -86,13 +91,13 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, bool can_unsync, bool host_writable, bool ad_disabled, u64 *new_spte) { - u64 spte = 0; + u64 spte = SPTE_MMU_PRESENT_MASK; int ret = 0; if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; + spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -124,7 +129,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, kvm_is_mmio_pfn(pfn)); if (host_writable) - spte |= SPTE_HOST_WRITEABLE; + spte |= shadow_host_writable_mask; else pte_access &= ~ACC_WRITE_MASK; @@ -134,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; /* * Optimization: for pte sync, if spte was writable the hash @@ -150,7 +155,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, __func__, gfn); ret |= SET_SPTE_WRITE_PROTECTED_PT; pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } } @@ -161,19 +166,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte = mark_spte_for_access_track(spte); out: + WARN_ON(is_mmio_spte(spte)); *new_spte = spte; return ret; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { - u64 spte; + u64 spte = SPTE_MMU_PRESENT_MASK; - spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; + spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -188,7 +194,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_host_writable_mask; new_spte = mark_spte_for_access_track(new_spte); @@ -242,53 +248,65 @@ u64 mark_spte_for_access_track(u64 spte) return spte; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + + /* + * Disable MMIO caching if the MMIO value collides with the bits that + * are used to hold the relocated GFN when the L1TF mitigation is + * enabled. This should never fire as there is no known hardware that + * can trigger this condition, e.g. SME/SEV CPUs that require a custom + * MMIO value are not susceptible to L1TF. + */ + if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << + SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) + mmio_value = 0; + + /* + * The masked MMIO value must obviously match itself and a removed SPTE + * must not get a false positive. Removed SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and removed SPTEs must + * not set any RWX bits. + */ + if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || + WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + mmio_value = 0; + + shadow_mmio_value = mmio_value; + shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -/* - * Sets the shadow PTE masks used by the MMU. - * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; + shadow_user_mask = VMX_EPT_READABLE_MASK; + shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; + shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_nx_mask = 0ull; + shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; + shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; + shadow_me_mask = 0ull; + + shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; + shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; + + /* + * EPT Misconfigurations are generated if the value of bits 2:0 + * of an EPT paging-structure entry is 110b (write/execute). + */ + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, + VMX_EPT_RWX_MASK, 0); } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; + u64 mask; shadow_phys_bits = kvm_get_shadow_phys_bits(); @@ -315,4 +333,30 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); + + shadow_user_mask = PT_USER_MASK; + shadow_accessed_mask = PT_ACCESSED_MASK; + shadow_dirty_mask = PT_DIRTY_MASK; + shadow_nx_mask = PT64_NX_MASK; + shadow_x_mask = 0; + shadow_present_mask = PT_PRESENT_MASK; + shadow_acc_track_mask = 0; + shadow_me_mask = sme_me_mask; + + shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE; + shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE; + + /* + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. + */ + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 667f207d3d099fda6493f10b50787aefb25ec400..6925dfc389817aa7fd5b4fe8482f93582cc98487 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -5,18 +5,33 @@ #include "mmu_internal.h" -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 +/* + * A MMU present SPTE is backed by actual memory and may or may not be present + * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it + * is ignored by all flavors of SPTEs and checking a low bit often generates + * better code than for a high bit, e.g. 56+. MMU present checks are pervasive + * enough that the improved code generation is noticeable in KVM's footprint. + */ +#define SPTE_MMU_PRESENT_MASK BIT_ULL(11) /* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. + * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also + * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. + * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that + * is must be employed for a given TDP SPTE. + * + * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE + * paging, including NPT PAE. This scheme works because legacy shadow paging + * is guaranteed to have A/D bits and write-protection is forced only for + * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it + * must be restricted to 64-bit KVM. */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) +#define SPTE_TDP_AD_SHIFT 52 +#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT) +static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) @@ -45,16 +60,46 @@ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. This mask obviously + * must not overlap the A/D type mask. + */ +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ + PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 +#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) +static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); + +/* + * Low ignored bits are at a premium for EPT, use high ignored bits, taking care + * to not overlap the A/D type mask or the saved access bits of access-tracked + * SPTEs when A/D bits are disabled. + */ +#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) +#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); +static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); +static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); +static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); + +/* Defined only to keep the above static asserts readable. */ +#undef SHADOW_ACC_TRACK_SAVED_MASK /* - * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62 + * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 + * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 * * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in * the MMIO generation number, as doing so would require stealing a bit from @@ -65,39 +110,44 @@ */ #define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_END 10 -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_START 52 #define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ MMIO_SPTE_GEN_LOW_START) #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) +static_assert(!(SPTE_MMU_PRESENT_MASK & + (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) /* remember to adjust the comment above as well if you change these */ -static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9); +static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) +extern u64 __read_mostly shadow_host_writable_mask; +extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ extern u64 __read_mostly shadow_user_mask; extern u64 __read_mostly shadow_accessed_mask; extern u64 __read_mostly shadow_dirty_mask; extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; extern u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ @@ -115,14 +165,26 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 /* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. + * If a thread running without exclusive control of the MMU lock must perform a + * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * non-present intermediate value. Other threads which encounter this value + * should not modify the SPTE. + * + * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on + * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF + * vulnerability. Use only low bits to avoid 64-bit immediates. + * + * Only used by the TDP MMU. */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) -#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT +#define REMOVED_SPTE 0x5a0ULL + +/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ +static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); + +static inline bool is_removed_spte(u64 spte) +{ + return spte == REMOVED_SPTE; +} /* * In some cases, we need to preserve the GFN of a non-present or reserved @@ -142,7 +204,13 @@ extern u8 __read_mostly shadow_phys_bits; static inline bool is_mmio_spte(u64 spte) { - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; + return (spte & shadow_mmio_mask) == shadow_mmio_value && + likely(shadow_mmio_value); +} + +static inline bool is_shadow_present_pte(u64 pte) +{ + return !!(pte & SPTE_MMU_PRESENT_MASK); } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) @@ -152,25 +220,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; + MMU_WARN_ON(!is_shadow_present_pte(spte)); + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; } static inline bool spte_ad_need_write_protect(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; + MMU_WARN_ON(!is_shadow_present_pte(spte)); + /* + * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', + * and non-TDP SPTEs will never set these bits. Optimize for 64-bit + * TDP and do the A/D type check unconditionally. + */ + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -179,23 +252,14 @@ static inline bool is_access_track_spte(u64 spte) return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } -static inline int is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte); -} - -static inline int is_large_pte(u64 pte) +static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; } -static inline int is_last_spte(u64 pte, int level) +static inline bool is_last_spte(u64 pte, int level) { - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; + return (level == PG_LEVEL_4K) || is_large_pte(pte); } static inline bool is_executable_pte(u64 spte) @@ -225,8 +289,8 @@ static inline bool is_dirty_spte(u64 spte) static inline bool spte_can_locklessly_be_made_writable(u64 spte) { - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); + return (spte & shadow_host_writable_mask) && + (spte & shadow_mmu_writable_mask); } static inline u64 get_mmio_spte_generation(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 1a09d212186b31be36971bdabfdc7f1815269768..b3ed302c1a359fbe5b46b9aad8f6c0bcc5e7944d 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); } static gfn_t round_gfn_for_level(gfn_t gfn, int level) @@ -20,6 +20,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) return gfn & -KVM_PAGES_PER_HPAGE(level); } +/* + * Return the TDP iterator to the root PT and allow it to continue its + * traversal over the paging structure from there. + */ +void tdp_iter_restart(struct tdp_iter *iter) +{ + iter->yielded_gfn = iter->next_last_level_gfn; + iter->level = iter->root_level; + + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + /* * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. @@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; - iter->yielded_gfn = iter->next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->level = root_level; - iter->pt_path[iter->level - 1] = root_pt; - - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); - tdp_iter_refresh_sptep(iter); + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; + iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); - iter->valid = true; + tdp_iter_restart(iter); } /* @@ -48,7 +59,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, * address of the child page table referenced by the SPTE. Returns null if * there is no such entry. */ -u64 *spte_to_child_pt(u64 spte, int level) +tdp_ptep_t spte_to_child_pt(u64 spte, int level) { /* * There's no child entry if this entry isn't present or is a @@ -57,7 +68,7 @@ u64 *spte_to_child_pt(u64 spte, int level) if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) return NULL; - return __va(spte_to_pfn(spte) << PAGE_SHIFT); + return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT); } /* @@ -66,7 +77,7 @@ u64 *spte_to_child_pt(u64 spte, int level) */ static bool try_step_down(struct tdp_iter *iter) { - u64 *child_pt; + tdp_ptep_t child_pt; if (iter->level == iter->min_level) return false; @@ -75,7 +86,7 @@ static bool try_step_down(struct tdp_iter *iter) * Reread the SPTE before stepping down to avoid traversing into page * tables that are no longer linked from this entry. */ - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); child_pt = spte_to_child_pt(iter->old_spte, iter->level); if (!child_pt) @@ -109,7 +120,7 @@ static bool try_step_side(struct tdp_iter *iter) iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); iter->next_last_level_gfn = iter->gfn; iter->sptep++; - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); return true; } @@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -u64 *tdp_iter_root_pt(struct tdp_iter *iter) -{ - return iter->pt_path[iter->root_level - 1]; -} - diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index d480c540ee27dfc7c4213fd0e4d70c864bced0c9..b1748b988d3aef556087b26ffad29accbd2756ff 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -7,6 +7,8 @@ #include "mmu.h" +typedef u64 __rcu *tdp_ptep_t; + /* * A TDP iterator performs a pre-order walk over a TDP paging structure. */ @@ -23,9 +25,9 @@ struct tdp_iter { */ gfn_t yielded_gfn; /* Pointers to the page tables traversed to reach the current SPTE */ - u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; /* A pointer to the current SPTE */ - u64 *sptep; + tdp_ptep_t sptep; /* The lowest GFN mapped by the current SPTE */ gfn_t gfn; /* The level of the root page given to the iterator */ @@ -34,6 +36,8 @@ struct tdp_iter { int min_level; /* The iterator's current level within the paging structure */ int level; + /* The address space ID, i.e. SMM vs. regular. */ + int as_id; /* A snapshot of the value at sptep */ u64 old_spte; /* @@ -55,11 +59,11 @@ struct tdp_iter { #define for_each_tdp_pte(iter, root, root_level, start, end) \ for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) -u64 *spte_to_child_pt(u64 pte, int level); +tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -u64 *tdp_iter_root_pt(struct tdp_iter *iter); +void tdp_iter_restart(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 073514bbb5f715684d8ef584583d6dc27a732c9c..27e18b82f8d5c272f304153bfbe2fb05794741f8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -7,31 +7,35 @@ #include "tdp_mmu.h" #include "spte.h" -#ifdef CONFIG_X86_64 +#include +#include + static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); -#endif - -static bool is_tdp_mmu_enabled(void) -{ -#ifdef CONFIG_X86_64 - return tdp_enabled && READ_ONCE(tdp_mmu_enabled); -#else - return false; -#endif /* CONFIG_X86_64 */ -} /* Initializes the TDP MMU for the VM, if enabled. */ -void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { - if (!is_tdp_mmu_enabled()) - return; + if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) + return false; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); + + return true; +} + +static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, + bool shared) +{ + if (shared) + lockdep_assert_held_read(&kvm->mmu_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); } void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) @@ -40,88 +44,120 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) return; WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); + + /* + * Ensure that all the outstanding RCU callbacks to free shadow pages + * can run before the VM is torn down. + */ + rcu_barrier(); } -static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield, bool flush, + bool shared); + +static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { - if (kvm_mmu_put_root(kvm, root)) - kvm_tdp_mmu_free_root(kvm, root); + free_page((unsigned long)sp->spt); + kmem_cache_free(mmu_page_header_cache, sp); } -static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, - struct kvm_mmu_page *root) +/* + * This is called through call_rcu in order to free TDP page table memory + * safely with respect to other kernel threads that may be operating on + * the memory. + * By only accessing TDP MMU page table memory in an RCU read critical + * section, and freeing it after a grace period, lockless access to that + * memory won't use it after it is freed. + */ +static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) { - lockdep_assert_held(&kvm->mmu_lock); - - if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) - return false; - - kvm_mmu_get_root(kvm, root); - return true; + struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, + rcu_head); + tdp_mmu_free_sp(sp); } -static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, - struct kvm_mmu_page *root) +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared) { - struct kvm_mmu_page *next_root; - - next_root = list_next_entry(root, link); - tdp_mmu_put_root(kvm, root); - return next_root; -} + gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); -/* - * Note: this iterator gets and puts references to the roots it iterates over. - * This makes it safe to release the MMU lock and yield within the loop, but - * if exiting the loop early, the caller must drop the reference to the most - * recent root. (Unless keeping a live reference is desirable.) - */ -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ - for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \ - typeof(*_root), link); \ - tdp_mmu_next_root_valid(_kvm, _root); \ - _root = tdp_mmu_next_root(_kvm, _root)) + kvm_lockdep_assert_mmu_lock_held(kvm, shared); -#define for_each_tdp_mmu_root(_kvm, _root) \ - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) + if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) + return; -bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) -{ - struct kvm_mmu_page *sp; + WARN_ON(!root->tdp_mmu_page); - if (!kvm->arch.tdp_mmu_enabled) - return false; - if (WARN_ON(!VALID_PAGE(hpa))) - return false; + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_del_rcu(&root->link); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); - sp = to_shadow_page(hpa); - if (WARN_ON(!sp)) - return false; + zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); - return sp->tdp_mmu_page && sp->root_count; + call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush); - -void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) +/* + * Finds the next valid root after root (or the first valid root if root + * is NULL), takes a reference on it, and returns that next root. If root + * is not NULL, this thread should have already taken a reference on it, and + * that reference will be dropped. If no valid root is found, this + * function will return NULL. + */ +static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, + struct kvm_mmu_page *prev_root, + bool shared) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + struct kvm_mmu_page *next_root; - lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); - WARN_ON(root->root_count); - WARN_ON(!root->tdp_mmu_page); + if (prev_root) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &prev_root->link, + typeof(*prev_root), link); + else + next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, + typeof(*next_root), link); - list_del(&root->link); + while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root)) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &next_root->link, typeof(*next_root), link); - zap_gfn_range(kvm, root, 0, max_gfn, false, false); + rcu_read_unlock(); - free_page((unsigned long)root->spt); - kmem_cache_free(mmu_page_header_cache, root); + if (prev_root) + kvm_tdp_mmu_put_root(kvm, prev_root, shared); + + return next_root; } +/* + * Note: this iterator gets and puts references to the roots it iterates over. + * This makes it safe to release the MMU lock and yield within the loop, but + * if exiting the loop early, the caller must drop the reference to the most + * recent root. (Unless keeping a live reference is desirable.) + * + * If shared is set, this function is operating under the MMU lock in read + * mode. In the unlikely event that this thread must free a root, the lock + * will be temporarily dropped and reacquired in write mode. + */ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _shared)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ + } else + +#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \ + lockdep_is_held_type(&kvm->mmu_lock, 0) || \ + lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ + } else + static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, int level) { @@ -149,66 +185,51 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, sp->gfn = gfn; sp->tdp_mmu_page = true; + trace_kvm_mmu_get_page(sp, true); + return sp; } -static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { union kvm_mmu_page_role role; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; - role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + lockdep_assert_held_write(&kvm->mmu_lock); - spin_lock(&kvm->mmu_lock); + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); /* Check for an existing root before allocating a new one. */ - for_each_tdp_mmu_root(kvm, root) { - if (root->role.word == role.word) { - kvm_mmu_get_root(kvm, root); - spin_unlock(&kvm->mmu_lock); - return root; - } + for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { + if (root->role.word == role.word && + kvm_tdp_mmu_get_root(kvm, root)) + goto out; } root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); - root->root_count = 1; + refcount_set(&root->tdp_mmu_root_count, 1); - list_add(&root->link, &kvm->arch.tdp_mmu_roots); - - spin_unlock(&kvm->mmu_lock); - - return root; -} - -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *root; - - root = get_tdp_mmu_vcpu_root(vcpu); - if (!root) - return INVALID_PAGE; + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +out: return __pa(root->spt); } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level); - -static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) -{ - return sp->role.smm ? 1 : 0; -} + u64 old_spte, u64 new_spte, int level, + bool shared); static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { - bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) return; if (is_accessed_spte(old_spte) && - (!is_accessed_spte(new_spte) || pfn_changed)) + (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } @@ -230,6 +251,142 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, } } +/** + * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU + * + * @kvm: kvm instance + * @sp: the new page + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be adding or removing pages. + * @account_nx: This page replaces a NX large page and should be marked for + * eventual reclaim. + */ +static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared, bool account_nx) +{ + if (shared) + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); + + list_add(&sp->link, &kvm->arch.tdp_mmu_pages); + if (account_nx) + account_huge_nx_page(kvm, sp); + + if (shared) + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +} + +/** + * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU + * + * @kvm: kvm instance + * @sp: the page to be removed + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be adding or removing pages. + */ +static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared) +{ + if (shared) + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); + + list_del(&sp->link); + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + + if (shared) + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +} + +/** + * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure + * + * @kvm: kvm instance + * @pt: the page removed from the paging structure + * @shared: This operation may not be running under the exclusive use + * of the MMU lock and the operation must synchronize with other + * threads that might be modifying SPTEs. + * + * Given a page table that has been removed from the TDP paging structure, + * iterates through the page table to clear SPTEs and free child page tables. + * + * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU + * protection. Since this thread removed it from the paging structure, + * this thread will be responsible for ensuring the page is freed. Hence the + * early rcu_dereferences in the function. + */ +static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, + bool shared) +{ + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); + int level = sp->role.level; + gfn_t base_gfn = sp->gfn; + int i; + + trace_kvm_mmu_prepare_zap_page(sp); + + tdp_mmu_unlink_page(kvm, sp, shared); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + u64 *sptep = rcu_dereference(pt) + i; + gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 old_child_spte; + + if (shared) { + /* + * Set the SPTE to a nonpresent value that other + * threads will not overwrite. If the SPTE was + * already marked as removed then another thread + * handling a page fault could overwrite it, so + * set the SPTE until it is set from some other + * value to the removed SPTE value. + */ + for (;;) { + old_child_spte = xchg(sptep, REMOVED_SPTE); + if (!is_removed_spte(old_child_spte)) + break; + cpu_relax(); + } + } else { + /* + * If the SPTE is not MMU-present, there is no backing + * page associated with the SPTE and so no side effects + * that need to be recorded, and exclusive ownership of + * mmu_lock ensures the SPTE can't be made present. + * Note, zapping MMIO SPTEs is also unnecessary as they + * are guarded by the memslots generation, not by being + * unreachable. + */ + old_child_spte = READ_ONCE(*sptep); + if (!is_shadow_present_pte(old_child_spte)) + continue; + + /* + * Marking the SPTE as a removed SPTE is not + * strictly necessary here as the MMU lock will + * stop other threads from concurrently modifying + * this SPTE. Using the removed SPTE value keeps + * the two branches consistent and simplifies + * the function. + */ + WRITE_ONCE(*sptep, REMOVED_SPTE); + } + handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, + old_child_spte, REMOVED_SPTE, level, + shared); + } + + kvm_flush_remote_tlbs_with_address(kvm, base_gfn, + KVM_PAGES_PER_HPAGE(level + 1)); + + call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); +} + /** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance @@ -238,22 +395,22 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * @old_spte: The value of the SPTE before the change * @new_spte: The value of the SPTE after the change * @level: the level of the PT the SPTE is part of in the paging structure + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be modifying SPTEs. * * Handle bookkeeping that might result from the modification of a SPTE. * This function must be called for all TDP SPTE modifications. */ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) + u64 old_spte, u64 new_spte, int level, + bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - u64 *pt; - struct kvm_mmu_page *sp; - u64 old_child_spte; - int i; WARN_ON(level > PT64_ROOT_MAX_LEVEL); WARN_ON(level < PG_LEVEL_4K); @@ -285,6 +442,15 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (old_spte == new_spte) return; + trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + + if (is_large_pte(old_spte) != is_large_pte(new_spte)) { + if (is_large_pte(old_spte)) + atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); + else + atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); + } + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -292,15 +458,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE, it is - * unexpected. Log the change, though it should not impact the - * guest since both the former and current SPTEs are nonpresent. + * If this change does not involve a MMIO SPTE or removed SPTE, + * it is unexpected. Log the change, though it should not + * impact the guest since both the former and current SPTEs + * are nonpresent. */ - if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + if (WARN_ON(!is_mmio_spte(old_spte) && + !is_mmio_spte(new_spte) && + !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" - "are MMIO SPTEs.\n" + "are MMIO SPTEs, or the new SPTE is\n" + "a temporary removed SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -308,64 +478,149 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_leaf && is_dirty_spte(old_spte) && - (!is_dirty_spte(new_spte) || pfn_changed)) + (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); /* * Recursively handle child PTs if the change removed a subtree from * the paging structure. */ - if (was_present && !was_leaf && (pfn_changed || !is_present)) { - pt = spte_to_child_pt(old_spte, level); - sp = sptep_to_sp(pt); + if (was_present && !was_leaf && (pfn_changed || !is_present)) + handle_removed_tdp_mmu_page(kvm, + spte_to_child_pt(old_spte, level), shared); +} - list_del(&sp->link); +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level, + bool shared) +{ + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, + shared); + handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); +} - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); +/* + * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically + * and handle the associated bookkeeping, but do not mark the page dirty + * in KVM's dirty bitmaps. + * + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + lockdep_assert_held_read(&kvm->mmu_lock); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - old_child_spte = READ_ONCE(*(pt + i)); - WRITE_ONCE(*(pt + i), 0); - handle_changed_spte(kvm, as_id, - gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), - old_child_spte, 0, level - 1); - } + /* + * Do not change removed SPTEs. Only the thread that froze the SPTE + * may modify it. + */ + if (is_removed_spte(iter->old_spte)) + return false; + + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. + */ + if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, + new_spte) != iter->old_spte) + return false; - kvm_flush_remote_tlbs_with_address(kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); + handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); - free_page((unsigned long)pt); - kmem_cache_free(mmu_page_header_cache, sp); - } + return true; } -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); - handle_changed_spte_acc_track(old_spte, new_spte, level); - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) + return false; + + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, + iter->old_spte, new_spte, iter->level); + return true; +} + +static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) +{ + /* + * Freeze the SPTE by setting it to a special, + * non-present value. This will stop other threads from + * immediately installing a present entry in its place + * before the TLBs are flushed. + */ + if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + return false; + + kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, + KVM_PAGES_PER_HPAGE(iter->level)); + + /* + * No other thread can overwrite the removed SPTE as they + * must either wait on the MMU lock or use + * tdp_mmu_set_spte_atomic which will not overrite the + * special removed SPTE value. No bookkeeping is needed + * here since the SPTE is going from non-present + * to non-present. + */ + WRITE_ONCE(*rcu_dereference(iter->sptep), 0); + + return true; } + +/* + * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * @record_acc_track: Notify the MM subsystem of changes to the accessed state + * of the page. Should be set unless handling an MMU + * notifier for access tracking. Leaving record_acc_track + * unset in that case prevents page accesses from being + * double counted. + * @record_dirty_log: Record the page as dirty in the dirty bitmap if + * appropriate for the change being made. Should be set + * unless performing certain dirty logging operations. + * Leaving record_dirty_log unset in that case prevents page + * writes from being double counted. + */ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - u64 *root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); + lockdep_assert_held_write(&kvm->mmu_lock); - WRITE_ONCE(*iter->sptep, new_spte); + /* + * No thread should be using this function to set SPTEs to or from the + * temporary removed SPTE value. + * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic + * should be used. If operating under the MMU lock in write mode, the + * use of the removed SPTE should not be necessary. + */ + WARN_ON(is_removed_spte(iter->old_spte) || is_removed_spte(new_spte)); - __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level); + WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); + + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level); } @@ -420,23 +675,29 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, * Return false if a yield was not needed. */ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, - struct tdp_iter *iter, bool flush) + struct tdp_iter *iter, bool flush, + bool shared) { /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) return false; - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + rcu_read_unlock(); + if (flush) kvm_flush_remote_tlbs(kvm); - cond_resched_lock(&kvm->mmu_lock); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); + + rcu_read_lock(); WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], - iter->root_level, iter->min_level, - iter->next_last_level_gfn); + tdp_iter_restart(iter); return true; } @@ -449,22 +710,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. + * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. Note, in some use cases a flush may be - * required by prior actions. Ensure the pending flush is performed prior to - * yielding. + * operation can cause a soft lockup. + * + * If shared is true, this thread holds the MMU lock in read mode and must + * account for the possibility that other threads are modifying the paging + * structures concurrently. If shared is false, this thread should hold the + * MMU lock in write mode. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush) + gfn_t start, gfn_t end, bool can_yield, bool flush, + bool shared) { struct tdp_iter iter; + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { +retry: if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { flush = false; continue; } @@ -482,10 +753,20 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); - flush = true; + if (!shared) { + tdp_mmu_set_spte(kvm, &iter, 0); + flush = true; + } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } } + rcu_read_unlock(); return flush; } @@ -494,15 +775,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. + * + * If shared is true, this thread holds the MMU lock in read mode and must + * account for the possibility that other threads are modifying the paging + * structures concurrently. If shared is false, this thread should hold the + * MMU in write mode. */ -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, - bool can_yield) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, + gfn_t end, bool can_yield, bool flush, + bool shared) { struct kvm_mmu_page *root; - bool flush = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) - flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, + shared); return flush; } @@ -510,13 +797,128 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - bool flush; + bool flush = false; + int i; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, + flush, false); + + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, + struct kvm_mmu_page *prev_root) +{ + struct kvm_mmu_page *next_root; + + if (prev_root) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &prev_root->link, + typeof(*prev_root), link); + else + next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, + typeof(*next_root), link); + + while (next_root && !(next_root->role.invalid && + refcount_read(&next_root->tdp_mmu_root_count))) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &next_root->link, + typeof(*next_root), link); + + return next_root; +} + +/* + * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast + * zap" completes. Since kvm_tdp_mmu_invalidate_all_roots() has acquired a + * reference to each invalidated root, roots will not be freed until after this + * function drops the gifted reference, e.g. so that vCPUs don't get stuck with + * tearing down paging structures. + */ +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) +{ + gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + struct kvm_mmu_page *next_root; + struct kvm_mmu_page *root; + bool flush = false; + + lockdep_assert_held_read(&kvm->mmu_lock); + + rcu_read_lock(); + + root = next_invalidated_root(kvm, NULL); + + while (root) { + next_root = next_invalidated_root(kvm, root); + + if (!root->tdp_mmu_scheduled_root_to_zap) + goto next; + + root->tdp_mmu_scheduled_root_to_zap = false; + + rcu_read_unlock(); + + flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, + true); + + /* + * Put the reference acquired in + * kvm_tdp_mmu_invalidate_all_roots + */ + kvm_tdp_mmu_put_root(kvm, root, true); + +next: + root = next_root; + + rcu_read_lock(); + } + + rcu_read_unlock(); - flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); if (flush) kvm_flush_remote_tlbs(kvm); } +/* + * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that + * is about to be zapped, e.g. in response to a memslots update. The caller is + * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual + * zapping. + * + * Take a reference on all roots to prevent the root from being freed before it + * is zapped by this thread. Freeing a root is not a correctness issue, but if + * a vCPU drops the last reference to a root prior to the root being zapped, it + * will get stuck with tearing down the entire paging structure. + * + * Get a reference even if the root is already invalid, + * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all + * invalid roots, e.g. there's no epoch to identify roots that were invalidated + * by a previous call. Roots stay on the list until the last reference is + * dropped, so even though all invalid roots are zapped, a root may not go away + * for quite some time, e.g. if a vCPU blocks across multiple memslot updates. + * + * Because mmu_lock is held for write, it should be impossible to observe a + * root with zero refcount, i.e. the list of roots cannot be stale. + * + * This has essentially the same effect for the TDP MMU + * as updating mmu_valid_gen does for the shadow MMU. + */ +void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) +{ + struct kvm_mmu_page *root; + + lockdep_assert_held_write(&kvm->mmu_lock); + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { + if (!root ->role.invalid && + !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(kvm, root))) { + root->tdp_mmu_scheduled_root_to_zap = true; + root->role.invalid = true; + } + } +} + /* * Installs a last-level SPTE to handle a TDP page fault. * (NPT/EPT violation/misconfiguration) @@ -530,10 +932,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, int ret = RET_PF_FIXED; int make_spte_ret = 0; - if (unlikely(is_noslot_pfn(pfn))) { + if (unlikely(is_noslot_pfn(pfn))) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); - trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); - } else + else make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, pfn, iter->old_spte, prefault, true, map_writable, !shadow_accessed_mask, @@ -541,8 +942,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else - tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + return RET_PF_RETRY; /* * If the page fault was caused by a write but the page is write @@ -556,11 +957,20 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ - if (unlikely(is_mmio_spte(new_spte))) + if (unlikely(is_mmio_spte(new_spte))) { + trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, + new_spte); ret = RET_PF_EMULATE; + } else { + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); + } - trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); - if (!prefault) + /* + * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be + * consistent with legacy MMU behavior. + */ + if (ret != RET_PF_SPURIOUS) vcpu->stat.pf_fixed++; return ret; @@ -588,15 +998,13 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int level; int req_level; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); + + rcu_read_lock(); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { if (nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(iter.old_spte, gfn, @@ -612,227 +1020,197 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, */ if (is_shadow_present_pte(iter.old_spte) && is_large_pte(iter.old_spte)) { - tdp_mmu_set_spte(vcpu->kvm, &iter, 0); - - kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, - KVM_PAGES_PER_HPAGE(iter.level)); + if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) + break; /* * The iter must explicitly re-read the spte here * because the new value informs the !present * path below. */ - iter.old_spte = READ_ONCE(*iter.sptep); + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); } if (!is_shadow_present_pte(iter.old_spte)) { - sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); - list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); + /* + * If SPTE has been forzen by another thread, just + * give up and retry, avoiding unnecessary page table + * allocation and free. + */ + if (is_removed_spte(iter.old_spte)) + break; + + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); child_pt = sp->spt; - clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - trace_kvm_mmu_get_page(sp, true); - if (huge_page_disallowed && req_level >= iter.level) - account_huge_nx_page(vcpu->kvm, sp); - - tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, + new_spte)) { + tdp_mmu_link_page(vcpu->kvm, sp, true, + huge_page_disallowed && + req_level >= iter.level); + + trace_kvm_mmu_get_page(sp, true); + } else { + tdp_mmu_free_sp(sp); + break; + } } } - if (WARN_ON(iter.level != level)) + if (iter.level != level) { + rcu_read_unlock(); return RET_PF_RETRY; + } ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, pfn, prefault); + rcu_read_unlock(); return ret; } -static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end, unsigned long data, - int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, - gfn_t end, unsigned long data)) +bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, + bool flush) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; struct kvm_mmu_page *root; - int ret = 0; - int as_id; - - for_each_tdp_mmu_root_yield_safe(kvm, root) { - as_id = kvm_mmu_page_as_id(root); - slots = __kvm_memslots(kvm, as_id); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - ret |= handler(kvm, memslot, root, gfn_start, - gfn_end, data); - } - } + for_each_tdp_mmu_root(kvm, root, range->slot->as_id) + flush |= zap_gfn_range(kvm, root, range->start, range->end, + range->may_block, flush, false); - return ret; + return flush; } -static int zap_gfn_range_hva_wrapper(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, - gfn_t end, unsigned long unused) -{ - return zap_gfn_range(kvm, root, start, end, false, false); -} +typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range); -int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end) +static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, + struct kvm_gfn_range *range, + tdp_handler_t handler) { - return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, - zap_gfn_range_hva_wrapper); + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; + + rcu_read_lock(); + + /* + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. + */ + for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) + ret |= handler(kvm, &iter, range); + } + + rcu_read_unlock(); + + return ret; } /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. */ -static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, gfn_t end, - unsigned long unused) +static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - int young = 0; u64 new_spte = 0; - tdp_root_for_each_leaf_pte(iter, root, start, end) { - /* - * If we have a non-accessed entry we don't need to change the - * pte. - */ - if (!is_accessed_spte(iter.old_spte)) - continue; - - new_spte = iter.old_spte; + /* If we have a non-accessed entry we don't need to change the pte. */ + if (!is_accessed_spte(iter->old_spte)) + return false; - if (spte_ad_enabled(new_spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)&new_spte); - } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(new_spte)) - kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + new_spte = iter->old_spte; - new_spte = mark_spte_for_access_track(new_spte); - } - new_spte &= ~shadow_dirty_mask; + if (spte_ad_enabled(new_spte)) { + new_spte &= ~shadow_accessed_mask; + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(new_spte)) + kvm_set_pfn_dirty(spte_to_pfn(new_spte)); - tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); - young = 1; + new_spte = mark_spte_for_access_track(new_spte); } - return young; + tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); + + return true; } -int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end) +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, - age_gfn_range); + return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); } -static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, - unsigned long unused2) +static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - - tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) - if (is_accessed_spte(iter.old_spte)) - return 1; - - return 0; + return is_accessed_spte(iter->old_spte); } -int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, - test_age_gfn); + return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); } -/* - * Handle the changed_pte MMU notifier for the TDP MMU. - * data is a pointer to the new pte_t mapping the HVA specified by the MMU - * notifier. - * Returns non-zero if a flush is needed before releasing the MMU lock. - */ -static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, - unsigned long data) +static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - pte_t *ptep = (pte_t *)data; - kvm_pfn_t new_pfn; u64 new_spte; - int need_flush = 0; - - WARN_ON(pte_huge(*ptep)); - new_pfn = pte_pfn(*ptep); - - tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { - if (iter.level != PG_LEVEL_4K) - continue; - - if (!is_shadow_present_pte(iter.old_spte)) - break; + /* Huge pages aren't expected to be modified without first being zapped. */ + WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); - tdp_mmu_set_spte(kvm, &iter, 0); - - kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); + if (iter->level != PG_LEVEL_4K || + !is_shadow_present_pte(iter->old_spte)) + return false; - if (!pte_write(*ptep)) { - new_spte = kvm_mmu_changed_pte_notifier_make_spte( - iter.old_spte, new_pfn); + /* + * Note, when changing a read-only SPTE, it's not strictly necessary to + * zero the SPTE before setting the new PFN, but doing so preserves the + * invariant that the PFN of a present * leaf SPTE can never change. + * See __handle_changed_spte(). + */ + tdp_mmu_set_spte(kvm, iter, 0); - tdp_mmu_set_spte(kvm, &iter, new_spte); - } + if (!pte_write(range->pte)) { + new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, + pte_pfn(range->pte)); - need_flush = 1; + tdp_mmu_set_spte(kvm, iter, new_spte); } - if (need_flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); - - return 0; + return true; } -int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, - pte_t *host_ptep) +/* + * Handle the changed_pte MMU notifier for the TDP MMU. + * data is a pointer to the new pte_t mapping the HVA specified by the MMU + * notifier. + * Returns non-zero if a flush is needed before releasing the MMU lock. + */ +bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, - (unsigned long)host_ptep, - set_tdp_spte); + bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); + + /* FIXME: return 'flush' instead of flushing here. */ + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, range->start, 1); + + return false; } /* - * Remove write access from all the SPTEs mapping GFNs [start, end). If - * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. + * Remove write access from all SPTEs at or above min_level that map GFNs + * [start, end). Returns true if an SPTE has been changed and the TLBs need to + * be flushed. */ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, int min_level) @@ -841,22 +1219,36 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); for_each_tdp_pte_min_level(iter, root->spt, root->role.level, min_level, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + !is_last_spte(iter.old_spte, iter.level) || + !(iter.old_spte & PT_WRITABLE_MASK)) continue; new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, + new_spte)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } spte_set = true; } + + rcu_read_unlock(); return spte_set; } @@ -869,17 +1261,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + lockdep_assert_held_read(&kvm->mmu_lock); + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); - } return spte_set; } @@ -898,8 +1286,11 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (!is_shadow_present_pte(iter.old_spte)) @@ -917,9 +1308,19 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; } - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, + new_spte)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } spte_set = true; } + + rcu_read_unlock(); return spte_set; } @@ -933,17 +1334,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + lockdep_assert_held_read(&kvm->mmu_lock); + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - } return spte_set; } @@ -961,6 +1358,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, struct tdp_iter iter; u64 new_spte; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) @@ -970,6 +1369,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, !(mask & (1UL << (iter.gfn - gfn)))) continue; + mask &= ~(1UL << (iter.gfn - gfn)); + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { if (is_writable_pte(iter.old_spte)) new_spte = iter.old_spte & ~PT_WRITABLE_MASK; @@ -983,9 +1384,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, } tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); - - mask &= ~(1UL << (iter.gfn - gfn)); } + + rcu_read_unlock(); } /* @@ -1001,16 +1402,10 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, bool wrprot) { struct kvm_mmu_page *root; - int root_as_id; - - lockdep_assert_held(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + lockdep_assert_held_write(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); - } } /* @@ -1025,19 +1420,31 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; - tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + rcu_read_lock(); + + tdp_root_for_each_leaf_pte(iter, root, start, end) { +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - if (!is_shadow_present_pte(iter.old_spte)) + if (!is_shadow_present_pte(iter.old_spte) || + iter.old_spte & shadow_dirty_mask) continue; new_spte = iter.old_spte | shadow_dirty_mask; - tdp_mmu_set_spte(kvm, &iter, new_spte); + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } spte_set = true; } + rcu_read_unlock(); return spte_set; } @@ -1049,17 +1456,13 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + lockdep_assert_held_read(&kvm->mmu_lock); + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - } return spte_set; } @@ -1067,17 +1470,22 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) * Clear leaf entries which could be replaced by large mappings, for * GFNs within the slot. */ -static void zap_collapsible_spte_range(struct kvm *kvm, +static bool zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) + const struct kvm_memory_slot *slot, + bool flush) { + gfn_t start = slot->base_gfn; + gfn_t end = start + slot->npages; struct tdp_iter iter; kvm_pfn_t pfn; - bool spte_set = false; + + rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { - spte_set = false; +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { + flush = false; continue; } @@ -1087,42 +1495,47 @@ static void zap_collapsible_spte_range(struct kvm *kvm, pfn = spte_to_pfn(iter.old_spte); if (kvm_is_reserved_pfn(pfn) || - (!PageCompound(pfn_to_page(pfn)) && - !kvm_is_zone_device_pfn(pfn))) + iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, + pfn, PG_LEVEL_NUM)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); - - spte_set = true; + if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } + flush = true; } - if (spte_set) - kvm_flush_remote_tlbs(kvm); + rcu_read_unlock(); + + return flush; } /* * Clear non-leaf entries (and free associated page tables) which could * be replaced by large mappings, for GFNs within the slot. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot, + bool flush) { struct kvm_mmu_page *root; - int root_as_id; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + lockdep_assert_held_read(&kvm->mmu_lock); - zap_collapsible_spte_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - } + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + flush = zap_collapsible_spte_range(kvm, root, slot, flush); + + return flush; } /* * Removes write access on the last level SPTE mapping this GFN and unsets the - * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, @@ -1132,9 +1545,11 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { new_spte = iter.old_spte & - ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); if (new_spte == iter.old_spte) break; @@ -1143,35 +1558,34 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, spte_set = true; } + rcu_read_unlock(); + return spte_set; } /* * Removes write access on the last level SPTE mapping this GFN and unsets the - * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - lockdep_assert_held(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - + lockdep_assert_held_write(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root, slot->as_id) spte_set |= write_protect_gfn(kvm, root, gfn); - } + return spte_set; } /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) @@ -1185,8 +1599,45 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; - sptes[leaf - 1] = iter.old_spte; + sptes[leaf] = iter.old_spte; } return leaf; } + +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. + * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. + * + * WARNING: This function is only intended to be called during fast_page_fault. + */ +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + gfn_t gfn = addr >> PAGE_SHIFT; + tdp_ptep_t sptep = NULL; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + *spte = iter.old_spte; + sptep = iter.sptep; + } + + /* + * Perform the rcu_dereference to get the raw spte pointer value since + * we are passing it up to fast_page_fault, which is shared with the + * legacy MMU and thus does not retain the TDP MMU-specific __rcu + * annotation. + * + * This is safe since fast_page_fault obeys the contracts of this + * function as well as all TDP MMU contracts around modifying SPTEs + * outside of mmu_lock. + */ + return rcu_dereference(sptep); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index a7a3f6db263d2cf901e0a1f9cd739cf6a629ee72..8197ba2173096e549f393e3899e4c6f556f85299 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -5,45 +5,56 @@ #include -void kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); - -bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); -void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, - bool can_yield); -static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, - gfn_t end) +__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, + struct kvm_mmu_page *root) +{ + if (root->role.invalid) + return false; + + return refcount_inc_not_zero(&root->tdp_mmu_root_count); +} + +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared); + +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, + gfn_t end, bool can_yield, bool flush, + bool shared); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, + gfn_t start, gfn_t end, bool flush, + bool shared) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush, + shared); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1); /* * Don't allow yielding, as the caller may have pending pages to zap * on the shadow MMU. */ - return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); + lockdep_assert_held_write(&kvm->mmu_lock); + return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), + sp->gfn, end, false, false, false); } + void kvm_tdp_mmu_zap_all(struct kvm *kvm); +void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, bool prefault); -int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end); - -int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end); -int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); - -int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, - pte_t *host_ptep); +bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, + bool flush); +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int min_level); @@ -54,13 +65,56 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, gfn_t gfn, unsigned long mask, bool wrprot); bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot, + bool flush); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); +static inline void kvm_tdp_mmu_walk_lockless_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_tdp_mmu_walk_lockless_end(void) +{ + rcu_read_unlock(); +} + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte); + +#ifdef CONFIG_X86_64 +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } + +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) +{ + struct kvm_mmu_page *sp; + hpa_t hpa = mmu->root_hpa; + + if (WARN_ON(!VALID_PAGE(hpa))) + return false; + + /* + * A NULL shadow page is legal when shadowing a non-paging guest with + * PAE paging, as the MMU will be direct with root_hpa pointing at the + * pae_root page, not a shadow page. + */ + sp = to_shadow_page(hpa); + return sp && is_tdp_mmu_page(sp) && sp->root_count; +} +#else +static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } +static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } +#endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 7f0059aa30e164232b088ced1a33068445656093..a9356e97d0a28fddf15de978ae262a9b4572c940 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -75,7 +75,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* variable MTRRs */ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); - mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { /* MTRR base */ if (!valid_mtrr_type(data & 0xff)) @@ -355,14 +355,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (var_mtrr_range_is_valid(cur)) list_del(&mtrr_state->var_ranges[index].node); - /* Extend the mask with all 1 bits to the left, since those - * bits must implicitly be 0. The bits are then cleared - * when reading them. + /* + * Set all illegal GPA bits in the mask, since those bits must + * implicitly be 0. The bits are then cleared when reading them. */ if (!is_mtrr_mask) cur->base = data; else - cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu)); + cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); /* add it to the list if it's enabled. */ if (var_mtrr_range_is_valid(cur)) { @@ -430,7 +430,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) else *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; - *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1; + *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); } return 0; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 0c164ca686c78c60ac1e1147daa3ba99feebb489..f85580a9aded18ce6a68a14914000036cbfce44f 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -275,10 +275,10 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) if (vmcb12_lma) { if (!(vmcb12->save.cr4 & X86_CR4_PAE) || !(vmcb12->save.cr0 & X86_CR0_PE) || - (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits)) + kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3)) return false; } - if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) + if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; return true; @@ -372,7 +372,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) { - if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)) + if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return -EINVAL; if (!nested_npt && is_pae_paging(vcpu) && diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9f6d8ea4b57cf0a3c05ad51c4b84c38393de1dd2..bd59fdc0686f441634be5eaac4a231bf9be8b453 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -870,7 +870,7 @@ static __init void svm_adjust_mmio_mask(void) */ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); } static void svm_hardware_teardown(void) @@ -3777,7 +3777,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (sev_guest(vcpu->kvm)) { best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); if (best) - vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f)); + vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } if (!kvm_vcpu_apicv_active(vcpu)) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1686e85bff13f0fa634fc3afbb9cc1cb00f8b73b..26f4febd2e4ce6ec3143d9d1ffa3b8fab7d8d003 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -748,8 +748,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, (CC(!nested_cpu_has_vid(vmcs12)) || CC(!nested_exit_intr_ack_set(vcpu)) || CC((vmcs12->posted_intr_nv & 0xff00)) || - CC((vmcs12->posted_intr_desc_addr & 0x3f)) || - CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))) + CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -762,13 +761,11 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { - int maxphyaddr; - if (count == 0) return 0; - maxphyaddr = cpuid_maxphyaddr(vcpu); - if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || - (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) + + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || + !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; return 0; @@ -1066,14 +1063,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, } } -static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long invalid_mask; - - invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - return (val & invalid_mask) == 0; -} - /* * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't @@ -1125,7 +1114,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, enum vm_entry_failure_code *entry_failure_code) { - if (CC(!nested_cr3_valid(vcpu, cr3))) { + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -2194,15 +2183,13 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); /* - * The PML address never changes, so it is constant in vmcs02. - * Conceptually we want to copy the PML index from vmcs01 here, - * and then back to vmcs01 on nested vmexit. But since we flush - * the log and reset GUEST_PML_INDEX on each vmexit, the PML - * index is also effectively constant in vmcs02. + * PML is emulated for L2, but never enabled in hardware as the MMU + * handles A/D emulation. Disabling PML for L2 also avoids having to + * deal with filtering out L2 GPAs from the buffer. */ if (enable_pml) { - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write64(PML_ADDRESS, 0); + vmcs_write16(GUEST_PML_INDEX, -1); } if (cpu_has_vmx_encls_vmexit()) @@ -2241,7 +2228,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, struct vmcs12 *vmcs12) { - u32 exec_control, vmcs12_exec_ctrl; + u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) @@ -2317,11 +2304,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_DESC); if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { - vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & - ~SECONDARY_EXEC_ENABLE_PML; - exec_control |= vmcs12_exec_ctrl; - } + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + exec_control |= vmcs12->secondary_vm_exec_control; + + /* PML is emulated and never enabled in hardware for L2. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* VMCS shadowing for L2 is emulated for now */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; @@ -2671,7 +2658,6 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); /* Check for memory type validity */ switch (new_eptp & VMX_EPTP_MT_MASK) { @@ -2702,7 +2688,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) } /* Reserved bits should not be set */ - if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f))) + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ @@ -2897,7 +2883,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || - CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) + CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) return -EINVAL; if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || @@ -5931,7 +5917,10 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_PREEMPTION_TIMER: return true; case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ + /* + * PML is emulated for an L1 VMM and should never be enabled in + * vmcs02, always "handle" PML_FULL by exiting to userspace. + */ return true; case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 23e68c7161daf3abf1f11000505c5f04a6a2e43b..546be0ee1d4e16a8201bdc31ba48149a64dc59ab 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1204,7 +1204,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) { /* The base must be 128-byte aligned and a legal physical address. */ - return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f); + return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); } static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) @@ -3364,7 +3364,7 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) /* * We operate under the default treatment of SMM, so VMX cannot be * enabled under SMM. Note, whether or not VMXE is allowed at all is - * handled by kvm_valid_cr4(). + * handled by kvm_is_valid_cr4(). */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; @@ -4579,15 +4579,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vmx->secondary_exec_control = exec_control; } -static void ept_set_mmio_spte_mask(void) -{ - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). - */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0); -} - static inline int vmx_get_pid_table_order(struct kvm *kvm) { return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); @@ -5853,18 +5844,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -static void vmx_enable_tdp(void) -{ - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, - enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK, 0ull); - - ept_set_mmio_spte_mask(); -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -6380,9 +6359,10 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before * querying dirty_bitmap, we only need to kick all vcpus out of guest * mode as if vcpus is in root mode, the PML buffer must has been - * flushed already. + * flushed already. Note, PML is never enabled in hardware while + * running L2. */ - if (enable_pml) + if (enable_pml && !is_guest_mode(vcpu)) vmx_flush_pml_buffer(vcpu); /* @@ -6398,6 +6378,13 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return handle_invalid_guest_state(vcpu); if (is_guest_mode(vcpu)) { + /* + * PML is never enabled when running L2, bail immediately if a + * PML full exit occurs as something is horribly wrong. + */ + if (exit_reason.basic == EXIT_REASON_PML_FULL) + goto unexpected_vmexit; + /* * The host physical addresses of some pages of guest memory * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC @@ -8395,7 +8382,8 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) - vmx_enable_tdp(); + kvm_mmu_set_ept_masks(enable_ept_ad_bits, + cpu_has_vmx_ept_execute_only()); if (!enable_ept) ept_lpage_level = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c5f09a8a5c9b8fe1840b6edcda7b17de2bdfb2d..e3ff20bf1ec027e16b88131512cc9e98d6766387 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -470,7 +470,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); - u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff | + u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) @@ -827,8 +827,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { - return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | - rsvd_bits(1, 2); + return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); } /* @@ -1056,20 +1055,17 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) - return -EINVAL; + return false; if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) - return -EINVAL; - - if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4)) - return -EINVAL; + return false; - return 0; + return kvm_x86_ops.is_valid_cr4(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_valid_cr4); +EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1078,7 +1074,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) X86_CR4_SMEP; unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE; - if (kvm_valid_cr4(vcpu, cr4)) + if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -1131,8 +1127,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu) && - (cr3 & vcpu->arch.cr3_lm_rsvd_bits)) + if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; else if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) @@ -7548,9 +7543,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (vcpu->arch.mmu->direct_map) { unsigned int indirect_shadow_pages; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); if (indirect_shadow_pages) kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -8412,9 +8407,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) @@ -10095,7 +10087,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); -static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* @@ -10103,21 +10095,20 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) * 64-bit mode (though maybe in a 32-bit code segment). * CR4.PAE and EFER.LMA must be set. */ - if (!(sregs->cr4 & X86_CR4_PAE) - || !(sregs->efer & EFER_LMA)) - return -EINVAL; - if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits) - return -EINVAL; + if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) + return false; + if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) + return false; } else { /* * Not in 64-bit mode: EFER.LMA is clear and the code * segment cannot be 64-bit. */ if (sregs->efer & EFER_LMA || sregs->cs.l) - return -EINVAL; + return false; } - return kvm_valid_cr4(vcpu, sregs->cr4); + return kvm_is_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -10128,7 +10119,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) struct desc_ptr dt; int ret = -EINVAL; - if (kvm_valid_sregs(vcpu, sregs)) + if (!kvm_is_valid_sregs(vcpu, sregs)) goto out; apic_base_msr.data = sregs->apic_base; @@ -10453,7 +10444,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -10994,17 +10985,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +static void memslot_rmap_free(struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; + } +} - if (i == 0) - continue; +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + int i; + memslot_rmap_free(slot); + + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -11012,11 +11009,79 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) kvm_page_track_free_memslot(slot); } -static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, - unsigned long npages) +static int memslot_rmap_alloc(struct kvm_memory_slot *slot, + unsigned long npages) { + const int sz = sizeof(*slot->arch.rmap[0]); int i; + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + int level = i + 1; + int lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + WARN_ON(slot->arch.rmap[i]); + + slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); + if (!slot->arch.rmap[i]) { + memslot_rmap_free(slot); + return -ENOMEM; + } + } + + return 0; +} + +int alloc_all_memslots_rmaps(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r, i; + + /* + * Check if memslots alreday have rmaps early before acquiring + * the slots_arch_lock below. + */ + if (kvm_memslots_have_rmaps(kvm)) + return 0; + + mutex_lock(&kvm->slots_arch_lock); + + /* + * Read memslots_have_rmaps again, under the slots arch lock, + * before allocating the rmaps + */ + if (kvm_memslots_have_rmaps(kvm)) { + mutex_unlock(&kvm->slots_arch_lock); + return 0; + } + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + r = memslot_rmap_alloc(slot, slot->npages); + if (r) { + mutex_unlock(&kvm->slots_arch_lock); + return r; + } + } + } + + /* + * Ensure that memslots_have_rmaps becomes true strictly after + * all the rmap pointers are set. + */ + smp_store_release(&kvm->arch.memslots_have_rmaps, true); + mutex_unlock(&kvm->slots_arch_lock); + return 0; +} + +static int kvm_alloc_memslot_metadata(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages) +{ + int i, r; + /* * Clear out the previous array pointers for the KVM_MR_MOVE case. The * old arrays will be freed by __kvm_set_memory_region() if installing @@ -11024,7 +11089,13 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, */ memset(&slot->arch, 0, sizeof(slot->arch)); - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + if (kvm_memslots_have_rmaps(kvm)) { + r = memslot_rmap_alloc(slot, npages); + if (r) + return r; + } + + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; @@ -11033,14 +11104,6 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; - slot->arch.rmap[i] = - kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL_ACCOUNT); - if (!slot->arch.rmap[i]) - goto out_free; - if (i == 0) - continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; @@ -11070,12 +11133,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, return 0; out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); - slot->arch.rmap[i] = NULL; - if (i == 0) - continue; + memslot_rmap_free(slot); + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -11104,7 +11164,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change) { if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) - return kvm_alloc_memslot_metadata(memslot, + return kvm_alloc_memslot_metadata(kvm, memslot, mem->memory_size >> PAGE_SHIFT); return 0; } @@ -11115,12 +11175,20 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, enum kvm_mr_change change) { /* - * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot. - * See comments below. + * Nothing to do for RO slots (which can't be dirtied and can't be made + * writable) or CREATE/MOVE/DELETE of a slot. See comments below. */ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) return; + /* + * READONLY and non-flags changes were filtered out above, and the only + * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty + * logging isn't being toggled on or off. + */ + if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) + return; + /* * Dirty logging tracks sptes in 4k granularity, meaning that large * sptes have to be split. If live migration is successful, the guest @@ -11138,8 +11206,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * MOVE/DELETE: The old mappings will already have been cleaned up by * kvm_arch_flush_shadow_memslot() */ - if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); /* diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 20a1f446acdf728193eb89b80faf48a229843ced..de81c17afd6ba474f97952c12ace741159fa0f7d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -378,7 +378,7 @@ static inline bool kvm_dr6_valid(u64 data) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index 584b0de6f2ca21fe1552eba57e0e73f226250208..41c449ece2d89a78313a95b73767b0f0a94d46cb 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -12,8 +12,8 @@ #define _XTENSA_SPINLOCK_H #include -#include #include +#include #define smp_mb__after_spinlock() smp_mb() diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 778eb8cab61097eca795f2b4ce5e613dda61ae90..26c56c3a08a3a0d59626de843c3a5f6591cc6b75 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1703,7 +1703,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn) return -EINVAL; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvmgt_gfn_is_write_protected(info, gfn)) goto out; @@ -1712,7 +1712,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn) kvmgt_protect_table_add(info, gfn); out: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); return 0; } @@ -1737,7 +1737,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn) return -EINVAL; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (!kvmgt_gfn_is_write_protected(info, gfn)) goto out; @@ -1746,7 +1746,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn) kvmgt_protect_table_del(info, gfn); out: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); return 0; } @@ -1772,7 +1772,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, struct kvmgt_guest_info *info = container_of(node, struct kvmgt_guest_info, track_node); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < slot->npages; i++) { gfn = slot->base_gfn + i; if (kvmgt_gfn_is_write_protected(info, gfn)) { @@ -1781,7 +1781,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, kvmgt_protect_table_del(info, gfn); } } - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm) diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c index e9cad598189a729c06c176a8a299a3dfd3505885..8e200a1e0b671ef10e24fd7c6e72e720baad8adf 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -314,7 +314,12 @@ static int vm_walk_host_range(unsigned long long start, unsigned long tmp_gpa_to_hva = pic->gpa_to_hva; pic->gpa_to_hva = 0; + +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else spin_unlock_irq(&pic->kvm->mmu_lock); +#endif down_read(&walk->mm->mmap_lock); local_irq_disable(); ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, @@ -537,13 +542,21 @@ static int ept_page_range(struct page_idle_ctrl *pic, WARN_ON(addr >= end); +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock_irq(&pic->kvm->mmu_lock); +#else spin_lock_irq(&pic->kvm->mmu_lock); +#endif vcpu = kvm_get_vcpu(pic->kvm, 0); if (!vcpu) { pic->gpa_to_hva = 0; set_restart_gpa(TASK_SIZE, "NO-VCPU"); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else spin_unlock_irq(&pic->kvm->mmu_lock); +#endif return -EINVAL; } @@ -551,7 +564,11 @@ static int ept_page_range(struct page_idle_ctrl *pic, if (!VALID_PAGE(mmu->root_hpa)) { pic->gpa_to_hva = 0; set_restart_gpa(TASK_SIZE, "NO-HPA"); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else spin_unlock_irq(&pic->kvm->mmu_lock); +#endif return -EINVAL; } @@ -567,7 +584,11 @@ static int ept_page_range(struct page_idle_ctrl *pic, * and RET_RESCAN_FLAG will be set in ret value */ if (!(err & RET_RESCAN_FLAG)) +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else spin_unlock_irq(&pic->kvm->mmu_lock); +#endif else err &= ~RET_RESCAN_FLAG; @@ -584,7 +605,11 @@ static int ept_idle_supports_cpu(struct kvm *kvm) if (!vcpu) return -EINVAL; +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&kvm->mmu_lock); +#else spin_lock(&kvm->mmu_lock); +#endif mmu = kvm_arch_mmu_pointer(vcpu); if (kvm_mmu_ad_disabled(mmu)) { printk(KERN_NOTICE "CPU does not support EPT A/D bits tracking\n"); @@ -595,7 +620,11 @@ static int ept_idle_supports_cpu(struct kvm *kvm) ret = -EINVAL; } else ret = 0; +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&kvm->mmu_lock); +#else spin_unlock(&kvm->mmu_lock); +#endif return ret; } @@ -724,9 +753,17 @@ static int arm_page_range(struct page_idle_ctrl *pic, WARN_ON(addr >= end); +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&pic->kvm->mmu_lock); +#else spin_lock(&pic->kvm->mmu_lock); +#endif pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&pic->kvm->mmu_lock); +#else spin_unlock(&pic->kvm->mmu_lock); +#endif local_irq_disable(); do { @@ -1045,10 +1082,19 @@ static int page_scan_release(struct inode *inode, struct file *file) goto out; } #ifdef CONFIG_X86_64 +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&kvm->mmu_lock); +#else spin_lock(&kvm->mmu_lock); +#endif + kvm_flush_remote_tlbs(kvm); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&kvm->mmu_lock); +#else spin_unlock(&kvm->mmu_lock); #endif +#endif out: module_put(THIS_MODULE); diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 3aefde23dceab8b87d54f8c7e4493dc1a2b54eeb..c0e26b824081315aed985350445430e1b88e6943 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -15,6 +15,8 @@ #include +/* Must be included from asm/spinlock.h after defining arch_spin_is_locked. */ + /* * Writer states & reader shift and bias. */ @@ -116,15 +118,26 @@ static inline void queued_write_unlock(struct qrwlock *lock) smp_store_release(&lock->wlocked, 0); } +/** + * queued_rwlock_is_contended - check if the lock is contended + * @lock : Pointer to queue rwlock structure + * Return: 1 if lock contended, 0 otherwise + */ +static inline int queued_rwlock_is_contended(struct qrwlock *lock) +{ + return arch_spin_is_locked(&lock->wait_lock); +} + /* * Remapping rwlock architecture specific functions to the corresponding * queue rwlock functions. */ -#define arch_read_lock(l) queued_read_lock(l) -#define arch_write_lock(l) queued_write_lock(l) -#define arch_read_trylock(l) queued_read_trylock(l) -#define arch_write_trylock(l) queued_write_trylock(l) -#define arch_read_unlock(l) queued_read_unlock(l) -#define arch_write_unlock(l) queued_write_unlock(l) +#define arch_read_lock(l) queued_read_lock(l) +#define arch_write_lock(l) queued_write_lock(l) +#define arch_read_trylock(l) queued_read_trylock(l) +#define arch_write_trylock(l) queued_write_trylock(l) +#define arch_read_unlock(l) queued_read_unlock(l) +#define arch_write_unlock(l) queued_write_unlock(l) +#define arch_rwlock_is_contended(l) queued_rwlock_is_contended(l) #endif /* __ASM_GENERIC_QRWLOCK_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 066a45319f3bb35f0c2a917c2b536dbf05604abd..148bffa3a994c051cc6ffeccc94c152e0b32aa6f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -217,6 +218,28 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS +struct kvm_gfn_range { + struct kvm_memory_slot *slot; + gfn_t start; + gfn_t end; + pte_t pte; + bool may_block; +}; +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +#else +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end, unsigned flags); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */ +#endif + enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, @@ -419,9 +442,8 @@ struct kvm_irq_routing_table { #define KVM_PRIVATE_MEM_SLOTS 0 #endif -#ifndef KVM_MEM_SLOTS_NUM -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#endif +#define KVM_MEM_SLOTS_NUM SHRT_MAX +#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS) #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) @@ -445,8 +467,22 @@ struct kvm_memslots { }; struct kvm { +#ifdef KVM_HAVE_MMU_RWLOCK + rwlock_t mmu_lock; +#else spinlock_t mmu_lock; +#endif /* KVM_HAVE_MMU_RWLOCK */ + struct mutex slots_lock; + + /* + * Protects the arch-specific fields of struct kvm_memory_slots in + * use by the VM. To be used under the slots_lock (above) or in a + * kvm->srcu critical section where acquiring the slots_lock would + * lead to deadlock with the synchronize_srcu in + * install_new_memslots. + */ + struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; @@ -496,6 +532,8 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; + unsigned long mmu_notifier_range_start; + unsigned long mmu_notifier_range_end; #endif long tlbs_dirty; struct list_head devices; @@ -748,7 +786,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, - bool *writable); + bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); @@ -1130,7 +1168,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) } static inline unsigned long -__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { /* * The index was checked originally in search_memslots. To avoid @@ -1249,6 +1287,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) return 1; return 0; } + +static inline int mmu_notifier_retry_hva(struct kvm *kvm, + unsigned long mmu_seq, + unsigned long hva) +{ + lockdep_assert_held(&kvm->mmu_lock); + /* + * If mmu_notifier_count is non-zero, then the range maintained by + * kvm_mmu_notifier_invalidate_range_start contains all addresses that + * might be being invalidated. Note that it may include some false + * positives, due to shortcuts when handing concurrent invalidations. + */ + if (unlikely(kvm->mmu_notifier_count) && + hva >= kvm->mmu_notifier_range_start && + hva < kvm->mmu_notifier_range_end) + return 1; + if (kvm->mmu_notifier_seq != mmu_seq) + return 1; + return 0; +} #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 3dcd617e65ae902316399b0717f0dc0e834b355c..7ce9a51ae5c046dbb536fbf36d1b50f9685eab67 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -128,4 +128,11 @@ do { \ 1 : ({ local_irq_restore(flags); 0; }); \ }) +#ifdef arch_rwlock_is_contended +#define rwlock_is_contended(lock) \ + arch_rwlock_is_contended(&(lock)->raw_lock) +#else +#define rwlock_is_contended(lock) ((void)(lock), 0) +#endif /* arch_rwlock_is_contended */ + #endif /* __LINUX_RWLOCK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d39427f8044d3a5a4b57f4ee9e7aefbe82b3c4a5..c04a313a9dbd055a174e6ead75ae50806fe3393a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1995,12 +1995,24 @@ static inline int _cond_resched(void) { return 0; } }) extern int __cond_resched_lock(spinlock_t *lock); +extern int __cond_resched_rwlock_read(rwlock_t *lock); +extern int __cond_resched_rwlock_write(rwlock_t *lock); #define cond_resched_lock(lock) ({ \ ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ }) +#define cond_resched_rwlock_read(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_rwlock_read(lock); \ +}) + +#define cond_resched_rwlock_write(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_rwlock_write(lock); \ +}) + static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) @@ -2024,6 +2036,23 @@ static inline int spin_needbreak(spinlock_t *lock) #endif } +/* + * Check if a rwlock is contended. + * Returns non-zero if there is another task waiting on the rwlock. + * Returns zero if the lock is not contended or the system / underlying + * rwlock implementation does not support contention detection. + * Technically does not depend on CONFIG_PREEMPTION, but a general need + * for low latency. + */ +static inline int rwlock_needbreak(rwlock_t *lock) +{ +#ifdef CONFIG_PREEMPTION + return rwlock_is_contended(lock); +#else + return 0; +#endif +} + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 26cfb0fa8e7ee780658d0f9a03c0deb7f22358a7..279d0b7af42b6d1e31ea26ae303aa9afb92ba400 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -255,30 +255,6 @@ TRACE_EVENT(kvm_fpu, TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) ); -TRACE_EVENT(kvm_age_page, - TP_PROTO(ulong gfn, int level, struct kvm_memory_slot *slot, int ref), - TP_ARGS(gfn, level, slot, ref), - - TP_STRUCT__entry( - __field( u64, hva ) - __field( u64, gfn ) - __field( u8, level ) - __field( u8, referenced ) - ), - - TP_fast_assign( - __entry->gfn = gfn; - __entry->level = level; - __entry->hva = ((gfn - slot->base_gfn) << - PAGE_SHIFT) + slot->userspace_addr; - __entry->referenced = ref; - ), - - TP_printk("hva %llx gfn %llx level %u %s", - __entry->hva, __entry->gfn, __entry->level, - __entry->referenced ? "YOUNG" : "OLD") -); - #ifdef CONFIG_KVM_ASYNC_PF DECLARE_EVENT_CLASS(kvm_async_get_page_class, @@ -399,6 +375,72 @@ TRACE_EVENT(kvm_halt_poll_ns, #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(false, vcpu_id, new, old) +TRACE_EVENT(kvm_unmap_hva_range, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_set_spte_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) +); + +TRACE_EVENT(kvm_age_hva, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier age hva: %#016lx -- %#016lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_test_age_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 909b0bf22a1ecbfa963e8db1680c3b8eed9cfdfe..b94f3831e963a12576b33f5824251294147b938c 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -12,7 +12,6 @@ #include #include #include -#include /** * queued_read_lock_slowpath - acquire read lock of a queue rwlock diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8c8c946ee1de223cb5c84a770abbeeefccd7f9fb..9cc4928c5dfa2175e4e238d9a394849a86e0899b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7263,6 +7263,46 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +int __cond_resched_rwlock_read(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_read(lock); + + if (rwlock_needbreak(lock) || resched) { + read_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + read_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_read); + +int __cond_resched_rwlock_write(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_write(lock); + + if (rwlock_needbreak(lock) || resched) { + write_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + write_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_write); + /** * yield - yield the current processor to other threads. * diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 55c69d982c3b5ddcad664197d7aa22989ff49213..f6527704460e9145ae421fe64d9448fe674b7806 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -29,6 +29,7 @@ #define X86_CR4_OSFXSR (1ul << 9) #define X86_CR4_OSXMMEXCPT (1ul << 10) #define X86_CR4_UMIP (1ul << 11) +#define X86_CR4_LA57 (1ul << 12) #define X86_CR4_VMXE (1ul << 13) #define X86_CR4_SMXE (1ul << 14) #define X86_CR4_FSGSBASE (1ul << 16) @@ -38,6 +39,22 @@ #define X86_CR4_SMAP (1ul << 21) #define X86_CR4_PKE (1ul << 22) +/* CPUID.1.ECX */ +#define CPUID_VMX (1ul << 5) +#define CPUID_SMX (1ul << 6) +#define CPUID_PCID (1ul << 17) +#define CPUID_XSAVE (1ul << 26) + +/* CPUID.7.EBX */ +#define CPUID_FSGSBASE (1ul << 0) +#define CPUID_SMEP (1ul << 7) +#define CPUID_SMAP (1ul << 20) + +/* CPUID.7.ECX */ +#define CPUID_UMIP (1ul << 2) +#define CPUID_PKU (1ul << 3) +#define CPUID_LA57 (1ul << 16) + #define UNEXPECTED_VECTOR_PORT 0xfff0u /* General Registers in 64-Bit Mode */ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index e78d7e26ba61187f240e8665beb8f3ab749f0770..65eb1079a161ea39c161a474886e28707bba2d71 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -11,10 +11,6 @@ #include #include "processor.h" -#define CPUID_VMX_BIT 5 - -#define CPUID_VMX (1 << 5) - /* * Definitions of Primary Processor-Based VM-Execution Controls. */ diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 9f7656184f31edc23ae21ff72cfe6a36ffa8b617..318be0bf77ab603864fe15463b30ece4660ac142 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -24,16 +24,106 @@ #define VCPU_ID 5 +static void test_cr4_feature_bit(struct kvm_vm *vm, struct kvm_sregs *orig, + uint64_t feature_bit) +{ + struct kvm_sregs sregs; + int rc; + + /* Skip the sub-test, the feature is supported. */ + if (orig->cr4 & feature_bit) + return; + + memcpy(&sregs, orig, sizeof(sregs)); + sregs.cr4 |= feature_bit; + + rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + TEST_ASSERT(rc, "KVM allowed unsupported CR4 bit (0x%lx)", feature_bit); + + /* Sanity check that KVM didn't change anything. */ + vcpu_sregs_get(vm, VCPU_ID, &sregs); + TEST_ASSERT(!memcmp(&sregs, orig, sizeof(sregs)), "KVM modified sregs"); +} + +static uint64_t calc_cr4_feature_bits(struct kvm_vm *vm) +{ + struct kvm_cpuid_entry2 *cpuid_1, *cpuid_7; + uint64_t cr4; + + cpuid_1 = kvm_get_supported_cpuid_entry(1); + cpuid_7 = kvm_get_supported_cpuid_entry(7); + + cr4 = X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | + X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE | X86_CR4_PGE | + X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT; + if (cpuid_7->ecx & CPUID_UMIP) + cr4 |= X86_CR4_UMIP; + if (cpuid_7->ecx & CPUID_LA57) + cr4 |= X86_CR4_LA57; + if (cpuid_1->ecx & CPUID_VMX) + cr4 |= X86_CR4_VMXE; + if (cpuid_1->ecx & CPUID_SMX) + cr4 |= X86_CR4_SMXE; + if (cpuid_7->ebx & CPUID_FSGSBASE) + cr4 |= X86_CR4_FSGSBASE; + if (cpuid_1->ecx & CPUID_PCID) + cr4 |= X86_CR4_PCIDE; + if (cpuid_1->ecx & CPUID_XSAVE) + cr4 |= X86_CR4_OSXSAVE; + if (cpuid_7->ebx & CPUID_SMEP) + cr4 |= X86_CR4_SMEP; + if (cpuid_7->ebx & CPUID_SMAP) + cr4 |= X86_CR4_SMAP; + if (cpuid_7->ecx & CPUID_PKU) + cr4 |= X86_CR4_PKE; + + return cr4; +} + int main(int argc, char *argv[]) { struct kvm_sregs sregs; struct kvm_vm *vm; + uint64_t cr4; int rc; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - /* Create VM */ + /* + * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and + * use it to verify all supported CR4 bits can be set prior to defining + * the vCPU model, i.e. without doing KVM_SET_CPUID2. + */ + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + + sregs.cr4 |= calc_cr4_feature_bits(vm); + cr4 = sregs.cr4; + + rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", + sregs.cr4, cr4); + + /* Verify all unsupported features are rejected by KVM. */ + test_cr4_feature_bit(vm, &sregs, X86_CR4_UMIP); + test_cr4_feature_bit(vm, &sregs, X86_CR4_LA57); + test_cr4_feature_bit(vm, &sregs, X86_CR4_VMXE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_SMXE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_FSGSBASE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_PCIDE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_OSXSAVE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_SMEP); + test_cr4_feature_bit(vm, &sregs, X86_CR4_SMAP); + test_cr4_feature_bit(vm, &sregs, X86_CR4_PKE); + kvm_vm_free(vm); + + /* Create a "real" VM and verify APIC_BASE can be set. */ vm = vm_create_default(VCPU_ID, 0, NULL); vcpu_sregs_get(vm, VCPU_ID, &sregs); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 62d52df216b380e3229795bee751ada42ed92464..4c63787c7b08cbb102addf8b4f7f1cc820f8f1c6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -58,6 +58,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" +#include "mmu_lock.h" #include "vfio.h" #define CREATE_TRACE_POINTS @@ -508,48 +509,257 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, srcu_read_unlock(&kvm->srcu, idx); } +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + +typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); + +typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, + unsigned long end); + +typedef void (*on_unlock_fn_t)(struct kvm *kvm); + +struct kvm_hva_range { + unsigned long start; + unsigned long end; + pte_t pte; + hva_handler_t handler; + on_lock_fn_t on_lock; + on_unlock_fn_t on_unlock; + bool flush_on_ret; + bool may_block; +}; + +/* + * Use a dedicated stub instead of NULL to indicate that there is no callback + * function/handler. The compiler technically can't guarantee that a real + * function will have a non-zero address, and so it will generate code to + * check for !NULL, whereas comparing against a stub will be elided at compile + * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9). + */ +static void kvm_null_fn(void) +{ + +} +#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) + +static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_hva_range *range) +{ + bool ret = false, locked = false; + struct kvm_gfn_range gfn_range; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + int i, idx; + + /* A null handler is allowed if and only if on_lock() is provided. */ + if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && + IS_KVM_NULL_FN(range->handler))) + return 0; + + idx = srcu_read_lock(&kvm->srcu); + + /* The on_lock() path does not yet support lock elision. */ + if (!IS_KVM_NULL_FN(range->on_lock)) { + locked = true; + KVM_MMU_LOCK(kvm); + + range->on_lock(kvm, range->start, range->end); + + if (IS_KVM_NULL_FN(range->handler)) + goto out_unlock; + } + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + unsigned long hva_start, hva_end; + + hva_start = max(range->start, slot->userspace_addr); + hva_end = min(range->end, slot->userspace_addr + + (slot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + /* + * To optimize for the likely case where the address + * range is covered by zero or one memslots, don't + * bother making these conditional (to avoid writes on + * the second or later invocation of the handler). + */ + gfn_range.pte = range->pte; + gfn_range.may_block = range->may_block; + + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_range.start = hva_to_gfn_memslot(hva_start, slot); + gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); + gfn_range.slot = slot; + + if (!locked) { + locked = true; + KVM_MMU_LOCK(kvm); + } + ret |= range->handler(kvm, &gfn_range); + } + } + + if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) + kvm_flush_remote_tlbs(kvm); + +out_unlock: + if (locked) { + KVM_MMU_UNLOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_unlock)) + range->on_unlock(kvm); + + } + + srcu_read_unlock(&kvm->srcu, idx); + + /* The notifiers are averse to booleans. :-( */ + return (int)ret; +} + +static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + pte_t pte, + hva_handler_t handler) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + const struct kvm_hva_range range = { + .start = start, + .end = end, + .pte = pte, + .handler = handler, + .on_lock = (void *)kvm_null_fn, + .on_unlock = (void *)kvm_null_fn, + .flush_on_ret = true, + .may_block = false, + }; + + return __kvm_handle_hva_range(kvm, &range); +} + +static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + hva_handler_t handler) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + const struct kvm_hva_range range = { + .start = start, + .end = end, + .pte = __pte(0), + .handler = handler, + .on_lock = (void *)kvm_null_fn, + .on_unlock = (void *)kvm_null_fn, + .flush_on_ret = false, + .may_block = false, + }; + + return __kvm_handle_hva_range(kvm, &range); +} + +static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + /* + * The count increase must become visible at unlock time as no + * spte can be established without taking the mmu_lock and + * count is also read inside the mmu_lock critical section. + */ + kvm->mmu_notifier_count++; + if (likely(kvm->mmu_notifier_count == 1)) { + kvm->mmu_notifier_range_start = start; + kvm->mmu_notifier_range_end = end; + } else { + /* + * Fully tracking multiple concurrent ranges has dimishing + * returns. Keep things simple and just find the minimal range + * which includes the current and new ranges. As there won't be + * enough information to subtract a range after its invalidate + * completes, any ranges invalidated concurrently will + * accumulate and persist until all outstanding invalidates + * complete. + */ + kvm->mmu_notifier_range_start = + min(kvm->mmu_notifier_range_start, start); + kvm->mmu_notifier_range_end = + max(kvm->mmu_notifier_range_end, end); + } +} +#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */ + static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, pte_t pte) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS int idx; + struct kvm *kvm = mmu_notifier_to_kvm(mn); +#endif + trace_kvm_set_spte_hva(address); + +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); +#else idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + + KVM_MMU_LOCK(kvm); + kvm->mmu_notifier_seq++; if (kvm_set_spte_hva(kvm, address, pte)) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); +#endif } static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + const struct kvm_hva_range hva_range = { + .start = range->start, + .end = range->end, + .pte = __pte(0), + .handler = kvm_unmap_gfn_range, + .on_lock = kvm_inc_notifier_count, + .on_unlock = kvm_arch_guest_memory_reclaimed, + .flush_on_ret = true, + .may_block = mmu_notifier_range_blockable(range), + }; +#else int need_tlb_flush = 0, idx; +#endif + + trace_kvm_unmap_hva_range(range->start, range->end); +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + __kvm_handle_hva_range(kvm, &hva_range); +#else idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); - /* - * The count increase must become visible at unlock time as no - * spte can be established without taking the mmu_lock and - * count is also read inside the mmu_lock critical section. - */ - kvm->mmu_notifier_count++; + KVM_MMU_LOCK(kvm); need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, range->flags); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush || kvm->tlbs_dirty) kvm_flush_remote_tlbs(kvm); - - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); +#endif + kvm_arch_guest_memory_reclaimed(kvm); +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS srcu_read_unlock(&kvm->srcu, idx); +#endif return 0; } @@ -559,7 +769,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, { struct kvm *kvm = mmu_notifier_to_kvm(mn); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have @@ -573,7 +783,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, * in conjunction with the smp_rmb in mmu_notifier_retry(). */ kvm->mmu_notifier_count--; - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); BUG_ON(kvm->mmu_notifier_count < 0); } @@ -583,20 +793,27 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, unsigned long start, unsigned long end) { +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; +#endif + trace_kvm_age_hva(start, end); +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn); +#else idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); young = kvm_age_hva(kvm, start, end); if (young) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; +#endif } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -604,11 +821,13 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, unsigned long start, unsigned long end) { +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; +#endif + + trace_kvm_age_hva(start, end); - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is @@ -622,27 +841,41 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. */ +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); +#else + idx = srcu_read_lock(&kvm->srcu); + KVM_MMU_LOCK(kvm); young = kvm_age_hva(kvm, start, end); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; +#endif } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) { +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; +#endif + trace_kvm_test_age_hva(address); +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + return kvm_handle_hva_range_no_flush(mn, address, address + 1, + kvm_test_age_gfn); +#else idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); young = kvm_test_age_hva(kvm, address); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; +#endif } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, @@ -818,13 +1051,14 @@ static struct kvm *kvm_create_vm(unsigned long type) if (!kvm) return ERR_PTR(-ENOMEM); - spin_lock_init(&kvm->mmu_lock); + KVM_MMU_LOCK_INIT(kvm); mmgrab(current->mm); kvm->mm = current->mm; kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); + mutex_init(&kvm->slots_arch_lock); INIT_LIST_HEAD(&kvm->devices); BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); @@ -1211,6 +1445,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; rcu_assign_pointer(kvm->memslots[as_id], slots); + + /* + * Acquired in kvm_set_memslot. Must be released before synchronize + * SRCU below in order to avoid deadlock with another thread + * acquiring the slots_arch_lock in an srcu critical section. + */ + mutex_unlock(&kvm->slots_arch_lock); + synchronize_srcu_expedited(&kvm->srcu); /* @@ -1237,6 +1479,18 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, return old_memslots; } +static size_t kvm_memslots_size(int slots) +{ + return sizeof(struct kvm_memslots) + + (sizeof(struct kvm_memory_slot) * slots); +} + +static void kvm_copy_memslots(struct kvm_memslots *to, + struct kvm_memslots *from) +{ + memcpy(to, from, kvm_memslots_size(from->used_slots)); +} + /* * Note, at a minimum, the current number of used slots must be allocated, even * when deleting a memslot, as we need a complete duplicate of the memslots for @@ -1246,19 +1500,16 @@ static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old, enum kvm_mr_change change) { struct kvm_memslots *slots; - size_t old_size, new_size; - - old_size = sizeof(struct kvm_memslots) + - (sizeof(struct kvm_memory_slot) * old->used_slots); + size_t new_size; if (change == KVM_MR_CREATE) - new_size = old_size + sizeof(struct kvm_memory_slot); + new_size = kvm_memslots_size(old->used_slots + 1); else - new_size = old_size; + new_size = kvm_memslots_size(old->used_slots); slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT); if (likely(slots)) - memcpy(slots, old, old_size); + kvm_copy_memslots(slots, old); return slots; } @@ -1273,9 +1524,27 @@ static int kvm_set_memslot(struct kvm *kvm, struct kvm_memslots *slots; int r; + /* + * Released in install_new_memslots. + * + * Must be held from before the current memslots are copied until + * after the new memslots are installed with rcu_assign_pointer, + * then released before the synchronize srcu in install_new_memslots. + * + * When modifying memslots outside of the slots_lock, must be held + * before reading the pointer to the current memslots until after all + * changes to those memslots are complete. + * + * These rules ensure that installing new memslots does not lose + * changes made to the previous memslots. + */ + mutex_lock(&kvm->slots_arch_lock); + slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change); - if (!slots) + if (!slots) { + mutex_unlock(&kvm->slots_arch_lock); return -ENOMEM; + } if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { /* @@ -1286,10 +1555,9 @@ static int kvm_set_memslot(struct kvm *kvm, slot->flags |= KVM_MEMSLOT_INVALID; /* - * We can re-use the old memslots, the only difference from the - * newly installed memslots is the invalid flag, which will get - * dropped by update_memslots anyway. We'll also revert to the - * old memslots if preparing the new memory region fails. + * We can re-use the memory from the old memslots. + * It will be overwritten with a copy of the new memslots + * after reacquiring the slots_arch_lock below. */ slots = install_new_memslots(kvm, as_id, slots); @@ -1302,6 +1570,17 @@ static int kvm_set_memslot(struct kvm *kvm, */ kvm_arch_flush_shadow_memslot(kvm, slot); kvm_arch_guest_memory_reclaimed(kvm); + + /* Released in install_new_memslots. */ + mutex_lock(&kvm->slots_arch_lock); + + /* + * The arch-specific fields of the memslots could have changed + * between releasing the slots_arch_lock in + * install_new_memslots and here, so get a fresh copy of the + * slots. + */ + kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id)); } r = kvm_arch_prepare_memory_region(kvm, new, mem, change); @@ -1317,8 +1596,13 @@ static int kvm_set_memslot(struct kvm *kvm, return 0; out_slots: - if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) + if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { + slot = id_to_memslot(slots, old->id); + slot->flags &= ~KVM_MEMSLOT_INVALID; slots = install_new_memslots(kvm, as_id, slots); + } else { + mutex_unlock(&kvm->slots_arch_lock); + } kvfree(slots); return r; } @@ -1606,7 +1890,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); memset(dirty_bitmap_buffer, 0, n); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; @@ -1622,7 +1906,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); } if (flush) @@ -1713,7 +1997,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) return -EFAULT; - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); for (offset = log->first_page, i = offset / BITS_PER_LONG, n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; i++, offset += BITS_PER_LONG) { @@ -1736,7 +2020,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, offset, mask); } } - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -2111,10 +2395,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, - bool *writable) + bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + if (hva) + *hva = addr; + if (addr == KVM_HVA_ERR_RO_BAD) { if (writable) *writable = false; @@ -2142,19 +2429,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, - write_fault, writable); + write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/mmu_lock.h new file mode 100644 index 0000000000000000000000000000000000000000..9e1308f9734c38ea1815c8a363abcd34d882b2f8 --- /dev/null +++ b/virt/kvm/mmu_lock.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef KVM_MMU_LOCK_H +#define KVM_MMU_LOCK_H 1 + +/* + * Architectures can choose whether to use an rwlock or spinlock + * for the mmu_lock. These macros, for use in common code + * only, avoids using #ifdefs in places that must deal with + * multiple architectures. + */ + +#ifdef KVM_HAVE_MMU_RWLOCK +#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) write_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) write_unlock(&(kvm)->mmu_lock) +#else +#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) spin_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) +#endif /* KVM_HAVE_MMU_RWLOCK */ + +#endif