From c35214850a1151e0ca935d830f0f6b50e52852aa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 22 Jan 2021 16:30:03 -0800 Subject: [PATCH 001/158] KVM: x86/mmu: Use boolean returns for (S)PTE accessors mainline inclusion from mainline-v5.12-rc1 commit 15e6a7e5324cc04a67891fc369ea834bbb7e7b42 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=15e6a7e5324cc04a67891fc369ea834bbb7e7b42 ---------------------------------------------------------------------- Return a 'bool' instead of an 'int' for various PTE accessors that are boolean in nature, e.g. is_shadow_present_pte(). Returning an int is goofy and potentially dangerous, e.g. if a flag being checked is moved into the upper 32 bits of a SPTE, then the compiler may silently squash the entire check since casting to an int is guaranteed to yield a return value of '0'. Opportunistically refactor is_last_spte() so that it naturally returns a bool value instead of letting it implicitly cast 0/1 to false/true. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210123003003.3137525-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/mmu/spte.h | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 581925e476d6..f61e18dad2f3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -145,7 +145,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * * TODO: introduce APIs to split these two cases. */ -static inline int is_writable_pte(unsigned long pte) +static inline bool is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 667f207d3d09..f07aee04e903 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -179,23 +179,19 @@ static inline bool is_access_track_spte(u64 spte) return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } -static inline int is_shadow_present_pte(u64 pte) +static inline bool is_shadow_present_pte(u64 pte) { return (pte != 0) && !is_mmio_spte(pte); } -static inline int is_large_pte(u64 pte) +static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; } -static inline int is_last_spte(u64 pte, int level) +static inline bool is_last_spte(u64 pte, int level) { - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; + return (level == PG_LEVEL_4K) || is_large_pte(pte); } static inline bool is_executable_pte(u64 spte) -- Gitee From 38a25b1115c5dc3d4ff8829fae4837d0169b279c Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 11:10:32 +0800 Subject: [PATCH 002/158] kvm: x86/mmu: Add existing trace points to TDP MMU mainline inclusion from mainline-v5.11-rc1 commit 33dd3574f5fef57c2c6caccf98925d63aa2a8d09 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=33dd3574f5fef57c2c6caccf98925d63aa2a8d09 ---------------------------------------------------------------------- The TDP MMU was initially implemented without some of the usual tracepoints found in mmu.c. Correct this discrepancy by adding the missing trace points to the TDP MMU. 
Tested: ran the demand paging selftest on an Intel Skylake machine with all the trace points used by the TDP MMU enabled and observed them firing with expected values. This patch can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/3812 Signed-off-by: Ben Gardon Message-Id: <20201027175944.1183301-1-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 073514bbb5f7..65c56ca66774 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -7,6 +7,8 @@ #include "tdp_mmu.h" #include "spte.h" +#include + #ifdef CONFIG_X86_64 static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); @@ -149,6 +151,8 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, sp->gfn = gfn; sp->tdp_mmu_page = true; + trace_kvm_mmu_get_page(sp, true); + return sp; } @@ -319,6 +323,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, pt = spte_to_child_pt(old_spte, level); sp = sptep_to_sp(pt); + trace_kvm_mmu_prepare_zap_page(sp); + list_del(&sp->link); if (sp->lpage_disallowed) @@ -533,11 +539,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (unlikely(is_noslot_pfn(pfn))) { new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); - } else + } else { make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, pfn, iter->old_spte, prefault, true, map_writable, !shadow_accessed_mask, &new_spte); + trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + } if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; @@ -743,6 +751,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); young = 1; + + trace_kvm_age_page(iter.gfn, iter.level, slot, young); } return young; -- Gitee From ce0dc002a29f26f4a232d1faade3030e89691c95 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 11:11:12 +0800 Subject: [PATCH 003/158] kvm: x86/mmu: Add TDP MMU SPTE changed trace point mainline inclusion from mainline-v5.11-rc1 commit b9a98c3437e353b269ebf3567acc5c3dc757c7a5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b9a98c3437e353b269ebf3567acc5c3dc757c7a5 ---------------------------------------------------------------------- Add an extremely verbose trace point to the TDP MMU to log all SPTE changes, regardless of callstack / motivation. This is useful when a complete picture of the paging structure is needed or a change cannot be explained with the other, existing trace points. Tested: ran the demand paging selftest on an Intel Skylake machine with all the trace points used by the TDP MMU enabled and observed them firing with expected values. 
This patch can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/3813 Signed-off-by: Ben Gardon Message-Id: <20201027175944.1183301-2-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmutrace.h | 29 +++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.c | 2 ++ 2 files changed, 31 insertions(+) diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 213699b27b44..e798489b56b5 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -381,6 +381,35 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + kvm_tdp_mmu_spte_changed, + TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte), + TP_ARGS(as_id, gfn, level, old_spte, new_spte), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, old_spte) + __field(u64, new_spte) + /* Level cannot be larger than 5 on x86, so it fits in a u8. */ + __field(u8, level) + /* as_id can only be 0 or 1 x86, so it fits in a u8. */ + __field(u8, as_id) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->old_spte = old_spte; + __entry->new_spte = new_spte; + __entry->level = level; + __entry->as_id = as_id; + ), + + TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx", + __entry->as_id, __entry->gfn, __entry->level, + __entry->old_spte, __entry->new_spte + ) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 65c56ca66774..2a1320a6d9a9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -289,6 +289,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (old_spte == new_spte) return; + trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ -- Gitee From 76e1be8366f28d519587591917943acbf8fa383a Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 11:11:37 +0800 Subject: [PATCH 004/158] KVM: x86/mmu: Add comment on __tdp_mmu_set_spte mainline inclusion from mainline-v5.12-rc1 commit fe43fa2f407b9d513f7bcf18142e14e1bf1508d6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fe43fa2f407b9d513f7bcf18142e14e1bf1508d6 ---------------------------------------------------------------------- __tdp_mmu_set_spte is a very important function in the TDP MMU which already accepts several arguments and will take more in future commits. To offset this complexity, add a comment to the function describing each of the arguemnts. No functional change intended. 
Reviewed-by: Peter Feiner Acked-by: Paolo Bonzini Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-3-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 2a1320a6d9a9..323ac6e14715 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -357,6 +357,22 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, new_spte, level); } +/* + * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * @record_acc_track: Notify the MM subsystem of changes to the accessed state + * of the page. Should be set unless handling an MMU + * notifier for access tracking. Leaving record_acc_track + * unset in that case prevents page accesses from being + * double counted. + * @record_dirty_log: Record the page as dirty in the dirty bitmap if + * appropriate for the change being made. Should be set + * unless performing certain dirty logging operations. + * Leaving record_dirty_log unset in that case prevents page + * writes from being double counted. + */ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) -- Gitee From 24e85ab62ebade77dceefbdaef387482cfa4e6d3 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 11:12:20 +0800 Subject: [PATCH 005/158] KVM: x86/mmu: Add lockdep when setting a TDP MMU SPTE mainline inclusion from mainline-v5.12-rc1 commit 3a9a4aa5657471a02ffb7f9b7f3b7a468b3f257b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3a9a4aa5657471a02ffb7f9b7f3b7a468b3f257b ---------------------------------------------------------------------- Add lockdep to __tdp_mmu_set_spte to ensure that SPTEs are only modified under the MMU lock. No functional change intended. 
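[Editorial sketch, not part of the patch: the helper and lock below are hypothetical. It shows the general shape of what lockdep_assert_held() buys — the locking rule becomes a runtime-checked assertion when CONFIG_LOCKDEP is enabled and compiles away otherwise.]

#include <linux/lockdep.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * Hypothetical helper, not KVM code: with CONFIG_LOCKDEP enabled, calling
 * this without @lock held produces a lockdep splat; with lockdep disabled
 * the assertion compiles to nothing.
 */
static void example_locked_store(spinlock_t *lock, u64 *field, u64 val)
{
	lockdep_assert_held(lock);	/* document and enforce the locking rule */
	WRITE_ONCE(*field, val);
}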
Reviewed-by: Peter Feiner Reviewed-by: Sean Christopherson Acked-by: Paolo Bonzini Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-4-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 323ac6e14715..be9997bc7486 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -381,6 +381,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *root = sptep_to_sp(root_pt); int as_id = kvm_mmu_page_as_id(root); + lockdep_assert_held(&kvm->mmu_lock); + WRITE_ONCE(*iter->sptep, new_spte); __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, -- Gitee From 51150c84b20af4ed986164e14a43880141f627d7 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 11:18:16 +0800 Subject: [PATCH 006/158] KVM: x86/mmu: Don't redundantly clear TDP MMU pt memory mainline inclusion from mainline-v5.12-rc1 commit 734e45b329d626d2c14e2bcf8be3d069a33c3316 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=734e45b329d626d2c14e2bcf8be3d069a33c3316 ---------------------------------------------------------------------- The KVM MMU caches already guarantee that shadow page table memory will be zeroed, so there is no reason to re-zero the page in the TDP MMU page fault handler. No functional change intended. Reviewed-by: Peter Feiner Reviewed-by: Sean Christopherson Acked-by: Paolo Bonzini Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-5-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index be9997bc7486..1d14498a9001 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -657,7 +657,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); child_pt = sp->spt; - clear_page(child_pt); new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); -- Gitee From fe26b7fa48d691574fe102f26396d4d04276080f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 11:20:45 +0800 Subject: [PATCH 007/158] KVM: x86/mmu: Factor out handling of removed page tables mainline inclusion from mainline-v5.12-rc1 commit a066e61f13cf4b17d043ad8bea0cdde2b1e5ee49 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a066e61f13cf4b17d043ad8bea0cdde2b1e5ee49 ---------------------------------------------------------------------- Factor out the code to handle a disconnected subtree of the TDP paging structure from the code to handle the change to an individual SPTE. Future commits will build on this to allow asynchronous page freeing. No functional change intended. 
Reviewed-by: Peter Feiner Acked-by: Paolo Bonzini Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-6-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 71 ++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1d14498a9001..b4774c4cab5f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -234,6 +234,45 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, } } +/** + * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure + * + * @kvm: kvm instance + * @pt: the page removed from the paging structure + * + * Given a page table that has been removed from the TDP paging structure, + * iterates through the page table to clear SPTEs and free child page tables. + */ +static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) +{ + struct kvm_mmu_page *sp = sptep_to_sp(pt); + int level = sp->role.level; + gfn_t gfn = sp->gfn; + u64 old_child_spte; + int i; + + trace_kvm_mmu_prepare_zap_page(sp); + + list_del(&sp->link); + + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + old_child_spte = READ_ONCE(*(pt + i)); + WRITE_ONCE(*(pt + i), 0); + handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), + gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), + old_child_spte, 0, level - 1); + } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + + free_page((unsigned long)pt); + kmem_cache_free(mmu_page_header_cache, sp); +} + /** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance @@ -254,10 +293,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - u64 *pt; - struct kvm_mmu_page *sp; - u64 old_child_spte; - int i; WARN_ON(level > PT64_ROOT_MAX_LEVEL); WARN_ON(level < PG_LEVEL_4K); @@ -321,31 +356,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Recursively handle child PTs if the change removed a subtree from * the paging structure. 
*/ - if (was_present && !was_leaf && (pfn_changed || !is_present)) { - pt = spte_to_child_pt(old_spte, level); - sp = sptep_to_sp(pt); - - trace_kvm_mmu_prepare_zap_page(sp); - - list_del(&sp->link); - - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); - - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - old_child_spte = READ_ONCE(*(pt + i)); - WRITE_ONCE(*(pt + i), 0); - handle_changed_spte(kvm, as_id, - gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), - old_child_spte, 0, level - 1); - } - - kvm_flush_remote_tlbs_with_address(kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); - - free_page((unsigned long)pt); - kmem_cache_free(mmu_page_header_cache, sp); - } + if (was_present && !was_leaf && (pfn_changed || !is_present)) + handle_removed_tdp_mmu_page(kvm, + spte_to_child_pt(old_spte, level)); } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, -- Gitee From ad6787df5977ded5e856f502fa19b586c631cdec Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:02:01 +0800 Subject: [PATCH 008/158] KVM: x86/mmu: Skip no-op changes in TDP MMU functions mainline inclusion from mainline-v5.12-rc1 commit 0f99ee2c7a58fc281c084d3acc0f0013bec7ec5a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0f99ee2c7a58fc281c084d3acc0f0013bec7ec5a ---------------------------------------------------------------------- Skip setting SPTEs if no change is expected. Reviewed-by: Peter Feiner Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-16-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index b4774c4cab5f..a94732d78cd7 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -891,7 +891,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + !is_last_spte(iter.old_spte, iter.level) || + !(iter.old_spte & PT_WRITABLE_MASK)) continue; new_spte = iter.old_spte & ~PT_WRITABLE_MASK; @@ -1071,7 +1072,8 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) continue; - if (!is_shadow_present_pte(iter.old_spte)) + if (!is_shadow_present_pte(iter.old_spte) || + iter.old_spte & shadow_dirty_mask) continue; new_spte = iter.old_spte | shadow_dirty_mask; -- Gitee From db5650715e3352a2b90bf76ddfffc14bf4d570d5 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:10:38 +0800 Subject: [PATCH 009/158] KVM: x86/mmu: Clear dirtied pages mask bit before early break mainline inclusion from mainline-v5.12-rc1 commit f1b3b06a058bb5c636ffad0afae138fe30775881 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f1b3b06a058bb5c636ffad0afae138fe30775881 ---------------------------------------------------------------------- In clear_dirty_pt_masked, the loop is intended to exit early after processing each of the GFNs with corresponding bits set in mask. This does not work as intended if another thread has already cleared the dirty bit or writable bit on the SPTE. 
In that case, the loop would proceed to the next iteration early and the bit in mask would not be cleared. As a result the loop could not exit early and would proceed uselessly. Move the unsetting of the mask bit before the check for a no-op SPTE change. Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU") Suggested-by: Sean Christopherson Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-17-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a94732d78cd7..cf5b29ec7a30 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1013,6 +1013,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, !(mask & (1UL << (iter.gfn - gfn)))) continue; + mask &= ~(1UL << (iter.gfn - gfn)); + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { if (is_writable_pte(iter.old_spte)) new_spte = iter.old_spte & ~PT_WRITABLE_MASK; @@ -1026,8 +1028,6 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, } tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); - - mask &= ~(1UL << (iter.gfn - gfn)); } } -- Gitee From 2c9b1105f3f508c26ba3d8cb9b1fb2ea2690c7b4 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:12:23 +0800 Subject: [PATCH 010/158] KVM: x86/mmu: Protect TDP MMU page table memory with RCU mainline inclusion from mainline-v5.12-rc1 commit 7cca2d0b7e7d9f3cd740d41afdc00051c9b508a0 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7cca2d0b7e7d9f3cd740d41afdc00051c9b508a0 ---------------------------------------------------------------------- In order to enable concurrent modifications to the paging structures in the TDP MMU, threads must be able to safely remove pages of page table memory while other threads are traversing the same memory. To ensure threads do not access PT memory after it is freed, protect PT memory with RCU. Protecting concurrent accesses to page table memory from use-after-free bugs could also have been acomplished using walk_shadow_page_lockless_begin/end() and READING_SHADOW_PAGE_TABLES, coupling with the barriers in a TLB flush. The use of RCU for this case has several distinct advantages over that approach. 1. Disabling interrupts for long running operations is not desirable. Future commits will allow operations besides page faults to operate without the exclusive protection of the MMU lock and those operations are too long to disable iterrupts for their duration. 2. The use of RCU here avoids long blocking / spinning operations in perfromance critical paths. By freeing memory with an asynchronous RCU API we avoid the longer wait times TLB flushes experience when overlapping with a thread in walk_shadow_page_lockless_begin/end(). 3. RCU provides a separation of concerns when removing memory from the paging structure. Because the RCU callback to free memory can be scheduled immediately after a TLB flush, there's no need for the thread to manually free a queue of pages later, as commit_zap_pages does. 
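[Editorial sketch of the read-side / free-side split described above. This is generic RCU code, not KVM code, and every name in it is hypothetical; it only illustrates the pattern: readers touch the table inside an RCU read-side critical section, and the updater defers the free past a grace period with call_rcu().]

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>

struct example_table {
	struct rcu_head rcu;
	u64 entries[512];
};

/* Readers only dereference the table inside rcu_read_lock()/unlock(). */
static u64 example_read_entry(struct example_table __rcu **slot, int i)
{
	struct example_table *t;
	u64 val = 0;

	rcu_read_lock();
	t = rcu_dereference(*slot);
	if (t)
		val = READ_ONCE(t->entries[i]);
	rcu_read_unlock();

	return val;
}

static void example_free_table(struct rcu_head *head)
{
	kfree(container_of(head, struct example_table, rcu));
}

/* The updater unlinks the table and frees it only after a grace period. */
static void example_remove_table(struct example_table __rcu **slot)
{
	struct example_table *old;

	/* The lock serializing updaters is assumed to be held here. */
	old = rcu_dereference_protected(*slot, true);
	rcu_assign_pointer(*slot, NULL);
	if (old)
		call_rcu(&old->rcu, example_free_table);
}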
Fixes: 95fb5b0258b7 ("kvm: x86/mmu: Support MMIO in the TDP MMU") Reviewed-by: Peter Feiner Suggested-by: Sean Christopherson Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-18-bgardon@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/tdp_mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu_internal.h | 3 ++ arch/x86/kvm/mmu/tdp_iter.c | 16 +++--- arch/x86/kvm/mmu/tdp_iter.h | 10 ++-- arch/x86/kvm/mmu/tdp_mmu.c | 94 +++++++++++++++++++++++++++++---- 4 files changed, 102 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bfc6389edc28..7f599cc64178 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -57,6 +57,9 @@ struct kvm_mmu_page { atomic_t write_flooding_count; bool tdp_mmu_page; + + /* Used for freeing the page asyncronously if it is a TDP MMU page. */ + struct rcu_head rcu_head; }; extern struct kmem_cache *mmu_page_header_cache; diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 1a09d212186b..e5f148106e20 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); } static gfn_t round_gfn_for_level(gfn_t gfn, int level) @@ -35,7 +35,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, iter->root_level = root_level; iter->min_level = min_level; iter->level = root_level; - iter->pt_path[iter->level - 1] = root_pt; + iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); @@ -48,7 +48,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, * address of the child page table referenced by the SPTE. Returns null if * there is no such entry. */ -u64 *spte_to_child_pt(u64 spte, int level) +tdp_ptep_t spte_to_child_pt(u64 spte, int level) { /* * There's no child entry if this entry isn't present or is a @@ -57,7 +57,7 @@ u64 *spte_to_child_pt(u64 spte, int level) if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) return NULL; - return __va(spte_to_pfn(spte) << PAGE_SHIFT); + return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT); } /* @@ -66,7 +66,7 @@ u64 *spte_to_child_pt(u64 spte, int level) */ static bool try_step_down(struct tdp_iter *iter) { - u64 *child_pt; + tdp_ptep_t child_pt; if (iter->level == iter->min_level) return false; @@ -75,7 +75,7 @@ static bool try_step_down(struct tdp_iter *iter) * Reread the SPTE before stepping down to avoid traversing into page * tables that are no longer linked from this entry. 
*/ - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); child_pt = spte_to_child_pt(iter->old_spte, iter->level); if (!child_pt) @@ -109,7 +109,7 @@ static bool try_step_side(struct tdp_iter *iter) iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); iter->next_last_level_gfn = iter->gfn; iter->sptep++; - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); return true; } @@ -159,7 +159,7 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -u64 *tdp_iter_root_pt(struct tdp_iter *iter) +tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) { return iter->pt_path[iter->root_level - 1]; } diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index d480c540ee27..4cc177d75c4a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -7,6 +7,8 @@ #include "mmu.h" +typedef u64 __rcu *tdp_ptep_t; + /* * A TDP iterator performs a pre-order walk over a TDP paging structure. */ @@ -23,9 +25,9 @@ struct tdp_iter { */ gfn_t yielded_gfn; /* Pointers to the page tables traversed to reach the current SPTE */ - u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; /* A pointer to the current SPTE */ - u64 *sptep; + tdp_ptep_t sptep; /* The lowest GFN mapped by the current SPTE */ gfn_t gfn; /* The level of the root page given to the iterator */ @@ -55,11 +57,11 @@ struct tdp_iter { #define for_each_tdp_pte(iter, root, root_level, start, end) \ for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) -u64 *spte_to_child_pt(u64 pte, int level); +tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -u64 *tdp_iter_root_pt(struct tdp_iter *iter); +tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index cf5b29ec7a30..10543f297718 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -42,6 +42,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) return; WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); + + /* + * Ensure that all the outstanding RCU callbacks to free shadow pages + * can run before the VM is torn down. + */ + rcu_barrier(); } static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) @@ -196,6 +202,28 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) return __pa(root->spt); } +static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) +{ + free_page((unsigned long)sp->spt); + kmem_cache_free(mmu_page_header_cache, sp); +} + +/* + * This is called through call_rcu in order to free TDP page table memory + * safely with respect to other kernel threads that may be operating on + * the memory. + * By only accessing TDP MMU page table memory in an RCU read critical + * section, and freeing it after a grace period, lockless access to that + * memory won't use it after it is freed. 
+ */ +static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) +{ + struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, + rcu_head); + + tdp_mmu_free_sp(sp); +} + static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level); @@ -269,8 +297,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) kvm_flush_remote_tlbs_with_address(kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - free_page((unsigned long)pt); - kmem_cache_free(mmu_page_header_cache, sp); + call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } /** @@ -390,13 +417,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - u64 *root_pt = tdp_iter_root_pt(iter); + tdp_ptep_t root_pt = tdp_iter_root_pt(iter); struct kvm_mmu_page *root = sptep_to_sp(root_pt); int as_id = kvm_mmu_page_as_id(root); lockdep_assert_held(&kvm->mmu_lock); - WRITE_ONCE(*iter->sptep, new_spte); + WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, iter->level); @@ -466,10 +493,13 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, return false; if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + rcu_read_unlock(); + if (flush) kvm_flush_remote_tlbs(kvm); cond_resched_lock(&kvm->mmu_lock); + rcu_read_lock(); WARN_ON(iter->gfn > iter->next_last_level_gfn); @@ -501,6 +531,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, { struct tdp_iter iter; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { @@ -525,6 +557,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, flush = true; } + rcu_read_unlock(); return flush; } @@ -571,13 +604,15 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (unlikely(is_noslot_pfn(pfn))) { new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); - trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); + trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, + new_spte); } else { make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, pfn, iter->old_spte, prefault, true, map_writable, !shadow_accessed_mask, &new_spte); - trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); } if (new_spte == iter->old_spte) @@ -600,7 +635,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (unlikely(is_mmio_spte(new_spte))) ret = RET_PF_EMULATE; - trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); if (!prefault) vcpu->stat.pf_fixed++; @@ -638,6 +674,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); + + rcu_read_lock(); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { if (nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(iter.old_spte, gfn, @@ -663,7 +702,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, * because the new value informs the !present * path below. 
*/ - iter.old_spte = READ_ONCE(*iter.sptep); + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); } if (!is_shadow_present_pte(iter.old_spte)) { @@ -681,11 +720,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } } - if (WARN_ON(iter.level != level)) + if (WARN_ON(iter.level != level)) { + rcu_read_unlock(); return RET_PF_RETRY; + } ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, pfn, prefault); + rcu_read_unlock(); return ret; } @@ -756,6 +798,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, int young = 0; u64 new_spte = 0; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { /* * If we have a non-accessed entry we don't need to change the @@ -787,6 +831,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, trace_kvm_age_page(iter.gfn, iter.level, slot, young); } + rcu_read_unlock(); + return young; } @@ -832,6 +878,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, u64 new_spte; int need_flush = 0; + rcu_read_lock(); + WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); @@ -860,6 +908,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, if (need_flush) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + rcu_read_unlock(); + return 0; } @@ -883,6 +933,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); for_each_tdp_pte_min_level(iter, root->spt, root->role.level, @@ -900,6 +952,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); spte_set = true; } + + rcu_read_unlock(); return spte_set; } @@ -941,6 +995,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) continue; @@ -963,6 +1019,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); spte_set = true; } + + rcu_read_unlock(); return spte_set; } @@ -1004,6 +1062,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, struct tdp_iter iter; u64 new_spte; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) @@ -1029,6 +1089,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); } + + rcu_read_unlock(); } /* @@ -1068,6 +1130,8 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) continue; @@ -1082,6 +1146,7 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, spte_set = true; } + rcu_read_unlock(); return spte_set; } @@ -1119,6 +1184,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm, kvm_pfn_t pfn; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { spte_set = false; @@ -1140,6 +1207,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, spte_set = true; } + rcu_read_unlock(); if (spte_set) kvm_flush_remote_tlbs(kvm); } @@ 
-1176,6 +1244,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { new_spte = iter.old_spte & ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); @@ -1187,6 +1257,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, spte_set = true; } + rcu_read_unlock(); + return spte_set; } @@ -1227,10 +1299,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; + rcu_read_lock(); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf - 1] = iter.old_spte; } + rcu_read_unlock(); + return leaf; } -- Gitee From d3af3e66b52066a47c3970b03b192e71592f61fa Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:36:04 +0800 Subject: [PATCH 011/158] locking/rwlocks: Add contention detection for rwlocks mainline inclusion from mainline-v5.12-rc1 commit 26128cb6c7e6731fe644c687af97733adfdb5ee9 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=26128cb6c7e6731fe644c687af97733adfdb5ee9 ---------------------------------------------------------------------- rwlocks do not currently have any facility to detect contention like spinlocks do. In order to allow users of rwlocks to better manage latency, add contention detection for queued rwlocks. CC: Ingo Molnar CC: Will Deacon Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Acked-by: Waiman Long Acked-by: Paolo Bonzini Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-7-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- include/asm-generic/qrwlock.h | 24 ++++++++++++++++++------ include/linux/rwlock.h | 7 +++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 3aefde23dcea..352a9ebb9f61 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -14,6 +14,7 @@ #include #include +#include /* * Writer states & reader shift and bias. @@ -116,15 +117,26 @@ static inline void queued_write_unlock(struct qrwlock *lock) smp_store_release(&lock->wlocked, 0); } +/** + * queued_rwlock_is_contended - check if the lock is contended + * @lock : Pointer to queue rwlock structure + * Return: 1 if lock contended, 0 otherwise + */ +static inline int queued_rwlock_is_contended(struct qrwlock *lock) +{ + return arch_spin_is_locked(&lock->wait_lock); +} + /* * Remapping rwlock architecture specific functions to the corresponding * queue rwlock functions. 
*/ -#define arch_read_lock(l) queued_read_lock(l) -#define arch_write_lock(l) queued_write_lock(l) -#define arch_read_trylock(l) queued_read_trylock(l) -#define arch_write_trylock(l) queued_write_trylock(l) -#define arch_read_unlock(l) queued_read_unlock(l) -#define arch_write_unlock(l) queued_write_unlock(l) +#define arch_read_lock(l) queued_read_lock(l) +#define arch_write_lock(l) queued_write_lock(l) +#define arch_read_trylock(l) queued_read_trylock(l) +#define arch_write_trylock(l) queued_write_trylock(l) +#define arch_read_unlock(l) queued_read_unlock(l) +#define arch_write_unlock(l) queued_write_unlock(l) +#define arch_rwlock_is_contended(l) queued_rwlock_is_contended(l) #endif /* __ASM_GENERIC_QRWLOCK_H */ diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 3dcd617e65ae..7ce9a51ae5c0 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -128,4 +128,11 @@ do { \ 1 : ({ local_irq_restore(flags); 0; }); \ }) +#ifdef arch_rwlock_is_contended +#define rwlock_is_contended(lock) \ + arch_rwlock_is_contended(&(lock)->raw_lock) +#else +#define rwlock_is_contended(lock) ((void)(lock), 0) +#endif /* arch_rwlock_is_contended */ + #endif /* __LINUX_RWLOCK_H */ -- Gitee From 2bb655c5d3c3c76052774e05260e55f9abd88c81 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 10 Feb 2021 13:16:31 -0500 Subject: [PATCH 012/158] locking/arch: Move qrwlock.h include after qspinlock.h mainline inclusion from mainline-v5.12-rc1 commit d8d0da4eee5c4e86ea08abde6975848376b4ac13 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8d0da4eee5c4e86ea08abde6975848376b4ac13 ---------------------------------------------------------------------- include/asm-generic/qrwlock.h was trying to get arch_spin_is_locked via asm-generic/qspinlock.h. However, this does not work because architectures might be using queued rwlocks but not queued spinlocks (csky), or because they might be defining their own queued_* macros before including asm/qspinlock.h. To fix this, ensure that asm/spinlock.h always includes qrwlock.h after defining arch_spin_is_locked (either directly for csky, or via asm/qspinlock.h for other architectures). The only inclusion elsewhere is in kernel/locking/qrwlock.c. That one is really unnecessary because the file is only compiled in SMP configurations (config QUEUED_RWLOCKS depends on SMP) and in that case linux/spinlock.h already includes asm/qrwlock.h if needed, via asm/spinlock.h. Reported-by: Guenter Roeck Signed-off-by: Waiman Long Fixes: 26128cb6c7e6 ("locking/rwlocks: Add contention detection for rwlocks") Tested-by: Guenter Roeck Reviewed-by: Ben Gardon [Add arch/sparc and kernel/locking parts per discussion with Waiman. 
- Paolo] Signed-off-by: Paolo Bonzini conflict: arch/arm64/include/asm/spinlock.h Signed-off-by: Yu Zhang --- arch/arm64/include/asm/spinlock.h | 2 +- arch/mips/include/asm/spinlock.h | 2 +- arch/sparc/include/asm/spinlock_64.h | 2 +- arch/xtensa/include/asm/spinlock.h | 2 +- include/asm-generic/qrwlock.h | 3 ++- kernel/locking/qrwlock.c | 1 - 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index ab3ab398fb96..cf827a6c3301 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -5,12 +5,12 @@ #ifndef __ASM_SPINLOCK_H #define __ASM_SPINLOCK_H -#include #include #include /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 << 15) +#include /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 8a88eb265516..6ce2117e49f6 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -10,7 +10,6 @@ #define _ASM_SPINLOCK_H #include -#include #include @@ -27,5 +26,6 @@ static inline void queued_spin_unlock(struct qspinlock *lock) } #include +#include #endif /* _ASM_SPINLOCK_H */ diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index 7fc82a233f49..3a9a0b0c7465 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include #endif /* !(__ASSEMBLY__) */ diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index 584b0de6f2ca..41c449ece2d8 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -12,8 +12,8 @@ #define _XTENSA_SPINLOCK_H #include -#include #include +#include #define smp_mb__after_spinlock() smp_mb() diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 352a9ebb9f61..c0e26b824081 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -14,7 +14,8 @@ #include #include -#include + +/* Must be included from asm/spinlock.h after defining arch_spin_is_locked. */ /* * Writer states & reader shift and bias. diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 909b0bf22a1e..b94f3831e963 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -12,7 +12,6 @@ #include #include #include -#include /** * queued_read_lock_slowpath - acquire read lock of a queue rwlock -- Gitee From 6b79349ace821311aff081d7da5472b064d763ac Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:37:37 +0800 Subject: [PATCH 013/158] sched: Add needbreak for rwlocks mainline inclusion from mainline-v5.12-rc1 commit a09a689a534183c48f200bc2de1ae61ae9c462ad category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a09a689a534183c48f200bc2de1ae61ae9c462ad ---------------------------------------------------------------------- Contention awareness while holding a spin lock is essential for reducing latency when long running kernel operations can hold that lock. Add the same contention detection interface for read/write spin locks. 
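[Editorial sketch, not from this patch: a hypothetical long-running writer polling the new rwlock_needbreak() interface and briefly dropping the lock when another task is waiting. The list and helper names are illustrative only.]

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static void example_long_scan(rwlock_t *lock, struct list_head *head)
{
	struct list_head *pos;

	write_lock(lock);
	list_for_each(pos, head) {
		/* ... process one element ... */

		if (rwlock_needbreak(lock) || need_resched()) {
			write_unlock(lock);
			cond_resched();
			write_lock(lock);
			break;	/* iterator may be stale; a real user would restart */
		}
	}
	write_unlock(lock);
}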
CC: Ingo Molnar CC: Will Deacon Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Acked-by: Waiman Long Acked-by: Paolo Bonzini Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-8-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- include/linux/sched.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index d39427f8044d..31610358216e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2024,6 +2024,23 @@ static inline int spin_needbreak(spinlock_t *lock) #endif } +/* + * Check if a rwlock is contended. + * Returns non-zero if there is another task waiting on the rwlock. + * Returns zero if the lock is not contended or the system / underlying + * rwlock implementation does not support contention detection. + * Technically does not depend on CONFIG_PREEMPTION, but a general need + * for low latency. + */ +static inline int rwlock_needbreak(rwlock_t *lock) +{ +#ifdef CONFIG_PREEMPTION + return rwlock_is_contended(lock); +#else + return 0; +#endif +} + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); -- Gitee From ee9d8c9d355d068d0d6de5014da4856af51ba841 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:39:30 +0800 Subject: [PATCH 014/158] sched: Add cond_resched_rwlock mainline inclusion from mainline-v5.12-rc1 commit f3d4b4b1dc1c5fb9ea17cac14133463bfe72f170 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f3d4b4b1dc1c5fb9ea17cac14133463bfe72f170 ---------------------------------------------------------------------- Safely rescheduling while holding a spin lock is essential for keeping long running kernel operations running smoothly. Add the facility to cond_resched rwlocks. 
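[Editorial sketch of a hypothetical caller of the new helper; names are illustrative only. cond_resched_rwlock_write() drops and re-takes the lock only when there is contention or a pending reschedule, and returns nonzero in that case so the caller knows its iterator may be stale — the same restart pattern the MMU code uses with its zap loops.]

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static int example_zap_all(rwlock_t *lock, struct list_head *head)
{
	struct list_head *pos, *n;
	int zapped = 0;

	write_lock(lock);
restart:
	list_for_each_safe(pos, n, head) {
		list_del(pos);
		zapped++;

		/* Returns nonzero if the lock was dropped and re-acquired. */
		if (cond_resched_rwlock_write(lock))
			goto restart;	/* the list may have changed meanwhile */
	}
	write_unlock(lock);

	return zapped;
}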
CC: Ingo Molnar CC: Will Deacon Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Acked-by: Waiman Long Acked-by: Paolo Bonzini Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-9-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- include/linux/sched.h | 12 ++++++++++++ kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 31610358216e..c04a313a9dbd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1995,12 +1995,24 @@ static inline int _cond_resched(void) { return 0; } }) extern int __cond_resched_lock(spinlock_t *lock); +extern int __cond_resched_rwlock_read(rwlock_t *lock); +extern int __cond_resched_rwlock_write(rwlock_t *lock); #define cond_resched_lock(lock) ({ \ ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ }) +#define cond_resched_rwlock_read(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_rwlock_read(lock); \ +}) + +#define cond_resched_rwlock_write(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_rwlock_write(lock); \ +}) + static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8c8c946ee1de..9cc4928c5dfa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7263,6 +7263,46 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +int __cond_resched_rwlock_read(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_read(lock); + + if (rwlock_needbreak(lock) || resched) { + read_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + read_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_read); + +int __cond_resched_rwlock_write(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_write(lock); + + if (rwlock_needbreak(lock) || resched) { + write_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + write_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_write); + /** * yield - yield the current processor to other threads. * -- Gitee From 91f65ea3582813e742f3ebca127c6f5ed8fda812 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:44:48 +0800 Subject: [PATCH 015/158] KVM: x86/mmu: Use an rwlock for the x86 MMU mainline inclusion from mainline-v5.12-rc1 commit 531810caa9f4bc99ffbb90e09256792c56a6b07a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=531810caa9f4bc99ffbb90e09256792c56a6b07a ---------------------------------------------------------------------- Add a read / write lock to be used in place of the MMU spinlock on x86. The rwlock will enable the TDP MMU to handle page faults, and other operations in parallel in future commits. 
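[Editorial sketch of the locking split the rwlock enables; the read-lock fault path only arrives with later patches in this series, and both functions below are hypothetical, not the patch's code.]

#include <linux/kvm_host.h>

/* Hypothetical: structural changes still serialize against everything. */
static void example_teardown(struct kvm *kvm)
{
	write_lock(&kvm->mmu_lock);
	/* ... zap roots and free page table pages ... */
	write_unlock(&kvm->mmu_lock);
}

/* Hypothetical: with later patches, TDP MMU faults can overlap each other. */
static void example_parallel_fault(struct kvm_vcpu *vcpu)
{
	read_lock(&vcpu->kvm->mmu_lock);
	/* ... walk the paging structure and install the SPTE atomically ... */
	read_unlock(&vcpu->kvm->mmu_lock);
}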
Reviewed-by: Peter Feiner Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-19-bgardon@google.com> [Introduce virt/kvm/mmu_lock.h - Paolo] Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/mmu.c virt/kvm/dirty_ring.c virt/kvm/kvm_main.c fs/proc/etmem_scan.c Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/mmu/mmu.c | 90 ++++++++++++++++----------------- arch/x86/kvm/mmu/page_track.c | 8 +-- arch/x86/kvm/mmu/paging_tmpl.h | 8 +-- arch/x86/kvm/mmu/tdp_mmu.c | 20 ++++---- arch/x86/kvm/x86.c | 4 +- fs/proc/etmem_scan.c | 46 +++++++++++++++++ include/linux/kvm_host.h | 5 ++ virt/kvm/kvm_main.c | 37 +++++++------- virt/kvm/mmu_lock.h | 23 +++++++++ 10 files changed, 161 insertions(+), 82 deletions(-) create mode 100644 virt/kvm/mmu_lock.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3dda2d99e9d8..8dfa0bd8d3f2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -355,6 +355,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +#define KVM_HAVE_MMU_RWLOCK + struct kvm_mmu_page; /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 89ad6239138e..79ddc61af1f2 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1998,9 +1998,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } - if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); - cond_resched_lock(&vcpu->kvm->mmu_lock); + cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } @@ -2452,7 +2452,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - @@ -2463,7 +2463,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -2474,7 +2474,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); @@ -2482,7 +2482,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return r; } @@ -3176,7 +3176,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) @@ -3199,7 +3199,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3220,16 +3220,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, { struct kvm_mmu_page *sp; - 
spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu)) { - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return INVALID_PAGE; } sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return __pa(sp->spt); } @@ -3439,17 +3439,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) !smp_load_acquire(&sp->unsync_children)) return; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); mmu_sync_children(vcpu, sp); kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return; } - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { @@ -3463,7 +3463,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); @@ -3761,7 +3761,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return r; r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; r = make_mmu_pages_available(vcpu); @@ -3776,7 +3776,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } @@ -5044,7 +5044,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ mmu_topup_memory_caches(vcpu, true); - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5076,7 +5076,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) @@ -5277,14 +5277,14 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, if (iterator.rmap) flush |= fn(kvm, iterator.rmap); - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && lock_flush_tlb) { kvm_flush_remote_tlbs_with_address(kvm, start_gfn, iterator.gfn - start_gfn + 1); flush = false; } - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); } } @@ -5440,7 +5440,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) * be in active use by the guest. 
*/ if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + cond_resched_rwlock_write(&kvm->mmu_lock)) { batch = 0; goto restart; } @@ -5474,7 +5474,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); trace_kvm_mmu_zap_all_fast(kvm); /* @@ -5501,7 +5501,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) if (kvm->arch.tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) @@ -5543,7 +5543,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { @@ -5567,7 +5567,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_flush_remote_tlbs(kvm); } - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, @@ -5582,12 +5582,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); if (kvm->arch.tdp_mmu_enabled) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); /* * We can flush all the TLBs out of the mmu lock without TLB @@ -5647,13 +5647,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* FIXME: const-ify all uses of struct kvm_memory_slot. 
*/ - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, kvm_mmu_zap_collapsible_spte, true); if (kvm->arch.tdp_mmu_enabled) kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, @@ -5676,11 +5676,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); if (kvm->arch.tdp_mmu_enabled) flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); /* * It's also safe to flush TLBs out of mmu lock here as currently this @@ -5698,12 +5698,12 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); if (kvm->arch.tdp_mmu_enabled) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5715,11 +5715,11 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); if (kvm->arch.tdp_mmu_enabled) flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5732,14 +5732,14 @@ void kvm_mmu_zap_all(struct kvm *kvm) LIST_HEAD(invalid_list); int ign; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; - if (cond_resched_lock(&kvm->mmu_lock)) + if (cond_resched_rwlock_write(&kvm->mmu_lock)) goto restart; } @@ -5748,7 +5748,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) if (kvm->arch.tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) @@ -5808,7 +5808,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) continue; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvm_has_zapped_obsolete_pages(kvm)) { kvm_mmu_commit_zap_page(kvm, @@ -5819,7 +5819,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); /* @@ -6052,7 +6052,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? 
DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; @@ -6076,15 +6076,15 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) WARN_ON_ONCE(sp->lpage_disallowed); } - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); flush = false; } } kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 81cf4babbd0b..20e62efd6457 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm, head = &kvm->arch.track_notifier_head; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); @@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, head = &kvm->arch.track_notifier_head; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); hlist_del_rcu(&n->node); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index c6daeeff1d9c..263b85384ef8 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -868,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, } r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -881,7 +881,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } @@ -919,7 +919,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) return; } - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -954,7 +954,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. 
*/ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 10543f297718..a0c6f04d4303 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -59,7 +59,7 @@ static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, struct kvm_mmu_page *root) { - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) return false; @@ -117,7 +117,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(root->root_count); WARN_ON(!root->tdp_mmu_page); @@ -170,13 +170,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); /* Check for an existing root before allocating a new one. */ for_each_tdp_mmu_root(kvm, root) { if (root->role.word == role.word) { kvm_mmu_get_root(kvm, root); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return root; } } @@ -186,7 +186,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) list_add(&root->link, &kvm->arch.tdp_mmu_roots); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return root; } @@ -421,7 +421,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *root = sptep_to_sp(root_pt); int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); @@ -492,13 +492,13 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, if (iter->next_last_level_gfn == iter->yielded_gfn) return false; - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { rcu_read_unlock(); if (flush) kvm_flush_remote_tlbs(kvm); - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); rcu_read_lock(); WARN_ON(iter->gfn > iter->next_last_level_gfn); @@ -1108,7 +1108,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root; int root_as_id; - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) @@ -1274,7 +1274,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, int root_as_id; bool spte_set = false; - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c5f09a8a5c9..4ff95c6a50ac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7548,9 +7548,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (vcpu->arch.mmu->direct_map) { unsigned int indirect_shadow_pages; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); if (indirect_shadow_pages) kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); diff --git a/fs/proc/etmem_scan.c 
b/fs/proc/etmem_scan.c index e9cad598189a..8e200a1e0b67 100644 --- a/fs/proc/etmem_scan.c +++ b/fs/proc/etmem_scan.c @@ -314,7 +314,12 @@ static int vm_walk_host_range(unsigned long long start, unsigned long tmp_gpa_to_hva = pic->gpa_to_hva; pic->gpa_to_hva = 0; + +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else spin_unlock_irq(&pic->kvm->mmu_lock); +#endif down_read(&walk->mm->mmap_lock); local_irq_disable(); ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, @@ -537,13 +542,21 @@ static int ept_page_range(struct page_idle_ctrl *pic, WARN_ON(addr >= end); +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock_irq(&pic->kvm->mmu_lock); +#else spin_lock_irq(&pic->kvm->mmu_lock); +#endif vcpu = kvm_get_vcpu(pic->kvm, 0); if (!vcpu) { pic->gpa_to_hva = 0; set_restart_gpa(TASK_SIZE, "NO-VCPU"); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else spin_unlock_irq(&pic->kvm->mmu_lock); +#endif return -EINVAL; } @@ -551,7 +564,11 @@ static int ept_page_range(struct page_idle_ctrl *pic, if (!VALID_PAGE(mmu->root_hpa)) { pic->gpa_to_hva = 0; set_restart_gpa(TASK_SIZE, "NO-HPA"); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else spin_unlock_irq(&pic->kvm->mmu_lock); +#endif return -EINVAL; } @@ -567,7 +584,11 @@ static int ept_page_range(struct page_idle_ctrl *pic, * and RET_RESCAN_FLAG will be set in ret value */ if (!(err & RET_RESCAN_FLAG)) +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else spin_unlock_irq(&pic->kvm->mmu_lock); +#endif else err &= ~RET_RESCAN_FLAG; @@ -584,7 +605,11 @@ static int ept_idle_supports_cpu(struct kvm *kvm) if (!vcpu) return -EINVAL; +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&kvm->mmu_lock); +#else spin_lock(&kvm->mmu_lock); +#endif mmu = kvm_arch_mmu_pointer(vcpu); if (kvm_mmu_ad_disabled(mmu)) { printk(KERN_NOTICE "CPU does not support EPT A/D bits tracking\n"); @@ -595,7 +620,11 @@ static int ept_idle_supports_cpu(struct kvm *kvm) ret = -EINVAL; } else ret = 0; +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&kvm->mmu_lock); +#else spin_unlock(&kvm->mmu_lock); +#endif return ret; } @@ -724,9 +753,17 @@ static int arm_page_range(struct page_idle_ctrl *pic, WARN_ON(addr >= end); +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&pic->kvm->mmu_lock); +#else spin_lock(&pic->kvm->mmu_lock); +#endif pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&pic->kvm->mmu_lock); +#else spin_unlock(&pic->kvm->mmu_lock); +#endif local_irq_disable(); do { @@ -1045,10 +1082,19 @@ static int page_scan_release(struct inode *inode, struct file *file) goto out; } #ifdef CONFIG_X86_64 +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&kvm->mmu_lock); +#else spin_lock(&kvm->mmu_lock); +#endif + kvm_flush_remote_tlbs(kvm); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&kvm->mmu_lock); +#else spin_unlock(&kvm->mmu_lock); #endif +#endif out: module_put(THIS_MODULE); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 066a45319f3b..97407c065b3e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -445,7 +445,12 @@ struct kvm_memslots { }; struct kvm { +#ifdef KVM_HAVE_MMU_RWLOCK + rwlock_t mmu_lock; +#else spinlock_t mmu_lock; +#endif /* KVM_HAVE_MMU_RWLOCK */ + struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 62d52df216b3..3ec37dd51a62 100644 --- a/virt/kvm/kvm_main.c +++ 
b/virt/kvm/kvm_main.c @@ -58,6 +58,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" +#include "mmu_lock.h" #include "vfio.h" #define CREATE_TRACE_POINTS @@ -517,13 +518,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, int idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + + KVM_MMU_LOCK(kvm); + kvm->mmu_notifier_seq++; if (kvm_set_spte_hva(kvm, address, pte)) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); } @@ -534,7 +537,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, int need_tlb_flush = 0, idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and @@ -547,7 +550,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, if (need_tlb_flush || kvm->tlbs_dirty) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); kvm_arch_guest_memory_reclaimed(kvm); srcu_read_unlock(&kvm->srcu, idx); @@ -559,7 +562,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, { struct kvm *kvm = mmu_notifier_to_kvm(mn); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have @@ -573,7 +576,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, * in conjunction with the smp_rmb in mmu_notifier_retry(). */ kvm->mmu_notifier_count--; - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); BUG_ON(kvm->mmu_notifier_count < 0); } @@ -587,13 +590,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, int young, idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); young = kvm_age_hva(kvm, start, end); if (young) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; @@ -608,7 +611,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, int young, idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is @@ -623,7 +626,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * more sophisticated heuristic later. 
*/ young = kvm_age_hva(kvm, start, end); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; @@ -637,9 +640,9 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, int young, idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); young = kvm_test_age_hva(kvm, address); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; @@ -818,7 +821,7 @@ static struct kvm *kvm_create_vm(unsigned long type) if (!kvm) return ERR_PTR(-ENOMEM); - spin_lock_init(&kvm->mmu_lock); + KVM_MMU_LOCK_INIT(kvm); mmgrab(current->mm); kvm->mm = current->mm; kvm_eventfd_init(kvm); @@ -1606,7 +1609,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); memset(dirty_bitmap_buffer, 0, n); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; @@ -1622,7 +1625,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); } if (flush) @@ -1713,7 +1716,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) return -EFAULT; - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); for (offset = log->first_page, i = offset / BITS_PER_LONG, n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; i++, offset += BITS_PER_LONG) { @@ -1736,7 +1739,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, offset, mask); } } - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/mmu_lock.h new file mode 100644 index 000000000000..9e1308f9734c --- /dev/null +++ b/virt/kvm/mmu_lock.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef KVM_MMU_LOCK_H +#define KVM_MMU_LOCK_H 1 + +/* + * Architectures can choose whether to use an rwlock or spinlock + * for the mmu_lock. These macros, for use in common code + * only, avoids using #ifdefs in places that must deal with + * multiple architectures. + */ + +#ifdef KVM_HAVE_MMU_RWLOCK +#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) write_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) write_unlock(&(kvm)->mmu_lock) +#else +#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) spin_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) +#endif /* KVM_HAVE_MMU_RWLOCK */ + +#endif -- Gitee From fd0bf5da7f3c32d900c4d36a29980125b8fd9897 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Feb 2021 06:29:28 -0500 Subject: [PATCH 016/158] i915: kvmgt: the KVM mmu_lock is now an rwlock mainline inclusion from mainline-v5.12-rc1 commit e36b250e502a6145fa95a3d93af83ba66fc6ae1b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e36b250e502a6145fa95a3d93af83ba66fc6ae1b ---------------------------------------------------------------------- Adjust the KVMGT page tracking callbacks. 
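For reference, the locking semantics this conversion relies on (any number of readers may hold mmu_lock at the same time, while a writer excludes both readers and other writers) can be sketched as a small, self-contained userspace program that builds with "gcc -pthread". The pthread rwlock below merely stands in for the kernel's rwlock_t, and all names are illustrative only:

    #include <pthread.h>
    #include <stdio.h>

    /* pthread_rwlock_t stands in for the kernel's rwlock_t. */
    static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int protected_state;

    static void *reader(void *arg)
    {
        pthread_rwlock_rdlock(&mmu_lock);   /* cf. read_lock(&kvm->mmu_lock) */
        printf("reader %ld sees %d\n", (long)arg, protected_state);
        pthread_rwlock_unlock(&mmu_lock);
        return NULL;
    }

    static void *writer(void *arg)
    {
        (void)arg;
        pthread_rwlock_wrlock(&mmu_lock);   /* cf. write_lock(&kvm->mmu_lock) */
        protected_state++;                  /* exclusive access while held */
        pthread_rwlock_unlock(&mmu_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[3];

        pthread_create(&t[0], NULL, reader, (void *)1L);
        pthread_create(&t[1], NULL, writer, NULL);
        pthread_create(&t[2], NULL, reader, (void *)2L);
        for (int i = 0; i < 3; i++)
            pthread_join(t[i], NULL);
        return 0;
    }

The KVMGT page-track callbacks below always modify the write-protection tables, so they take the lock in write mode, just like the spin_lock() calls they replace.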
Cc: Zhenyu Wang Cc: Zhi Wang Cc: intel-gvt-dev@lists.freedesktop.org Cc: intel-gfx@lists.freedesktop.org Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- drivers/gpu/drm/i915/gvt/kvmgt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 778eb8cab610..26c56c3a08a3 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1703,7 +1703,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn) return -EINVAL; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvmgt_gfn_is_write_protected(info, gfn)) goto out; @@ -1712,7 +1712,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn) kvmgt_protect_table_add(info, gfn); out: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); return 0; } @@ -1737,7 +1737,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn) return -EINVAL; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (!kvmgt_gfn_is_write_protected(info, gfn)) goto out; @@ -1746,7 +1746,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn) kvmgt_protect_table_del(info, gfn); out: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); return 0; } @@ -1772,7 +1772,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, struct kvmgt_guest_info *info = container_of(node, struct kvmgt_guest_info, track_node); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < slot->npages; i++) { gfn = slot->base_gfn + i; if (kvmgt_gfn_is_write_protected(info, gfn)) { @@ -1781,7 +1781,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, kvmgt_protect_table_del(info, gfn); } } - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm) -- Gitee From 13a36e040aea516eca6a854ad96fa321e65a3359 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:46:57 +0800 Subject: [PATCH 017/158] KVM: x86/mmu: Factor out functions to add/remove TDP MMU pages mainline inclusion from mainline-v5.12-rc1 commit a9442f594147f95307f691cfba0c31e25dc79b9d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a9442f594147f95307f691cfba0c31e25dc79b9d ---------------------------------------------------------------------- Move the work of adding and removing TDP MMU pages to/from "secondary" data structures to helper functions. These functions will be built on in future commits to enable MMU operations to proceed (mostly) in parallel. No functional change expected. 
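The effect of the refactor can be sketched with a self-contained userspace analogue (hypothetical names, with a pthread mutex standing in for the MMU lock): once every insertion and removal funnels through one pair of helpers, a later change to the locking rule only has to touch those helpers rather than every caller:

    #include <pthread.h>
    #include <stdio.h>

    struct page { struct page *prev, *next; int id; };

    static struct page pages = { &pages, &pages, 0 };   /* circular list head */
    static pthread_mutex_t pages_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Every insertion goes through one helper (cf. tdp_mmu_link_page()). */
    static void link_page(struct page *p)
    {
        pthread_mutex_lock(&pages_lock);
        p->next = pages.next;
        p->prev = &pages;
        pages.next->prev = p;
        pages.next = p;
        pthread_mutex_unlock(&pages_lock);
    }

    /* Every removal goes through one helper (cf. tdp_mmu_unlink_page()). */
    static void unlink_page(struct page *p)
    {
        pthread_mutex_lock(&pages_lock);
        p->prev->next = p->next;
        p->next->prev = p->prev;
        pthread_mutex_unlock(&pages_lock);
    }

    int main(void)
    {
        struct page a = { .id = 1 }, b = { .id = 2 };

        link_page(&a);
        link_page(&b);
        unlink_page(&a);
        for (struct page *p = pages.next; p != &pages; p = p->next)
            printf("still linked: page %d\n", p->id);
        return 0;
    }

In the kernel the list in question is kvm->arch.tdp_mmu_pages, and the interesting part becomes which lock the helpers assert or take once page faults are handled in parallel.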
Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-20-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 47 +++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a0c6f04d4303..42a3d4d0e3ce 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -262,6 +262,39 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, } } +/** + * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU + * + * @kvm: kvm instance + * @sp: the new page + * @account_nx: This page replaces a NX large page and should be marked for + * eventual reclaim. + */ +static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool account_nx) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + list_add(&sp->link, &kvm->arch.tdp_mmu_pages); + if (account_nx) + account_huge_nx_page(kvm, sp); +} + +/** + * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU + * + * @kvm: kvm instance + * @sp: the page to be removed + */ +static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + list_del(&sp->link); + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); +} + /** * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure * @@ -281,10 +314,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) trace_kvm_mmu_prepare_zap_page(sp); - list_del(&sp->link); - - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + tdp_mmu_unlink_page(kvm, sp); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { old_child_spte = READ_ONCE(*(pt + i)); @@ -707,15 +737,16 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (!is_shadow_present_pte(iter.old_spte)) { sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); - list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); child_pt = sp->spt; + + tdp_mmu_link_page(vcpu->kvm, sp, + huge_page_disallowed && + req_level >= iter.level); + new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); trace_kvm_mmu_get_page(sp, true); - if (huge_page_disallowed && req_level >= iter.level) - account_huge_nx_page(vcpu->kvm, sp); - tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); } } -- Gitee From 676778905e78c70662e81623a9bea30b9daa4641 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:48:32 +0800 Subject: [PATCH 018/158] KVM: x86/mmu: Use atomic ops to set SPTEs in TDP MMU map mainline inclusion from mainline-v5.12-rc1 commit 9a77daacc87dee9fd63e31243f21894132ed8407 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9a77daacc87dee9fd63e31243f21894132ed8407 ---------------------------------------------------------------------- To prepare for handling page faults in parallel, change the TDP MMU page fault handler to use atomic operations to set SPTEs so that changes are not lost if multiple threads attempt to modify the same SPTE. Reviewed-by: Peter Feiner Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-21-bgardon@google.com> [Document new locking rules. 
- Paolo] Signed-off-by: Paolo Bonzini conflict: arch/x86/include/asm/kvm_host.h Signed-off-by: Yu Zhang --- Documentation/virt/kvm/locking.rst | 9 +- arch/x86/include/asm/kvm_host.h | 14 +++ arch/x86/kvm/mmu/tdp_mmu.c | 142 ++++++++++++++++++++++------- 3 files changed, 131 insertions(+), 34 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index b21a34c34a21..0aa4817b466d 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -16,7 +16,14 @@ The acquisition orders for mutexes are as follows: - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring them together is quite rare. -On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. +On x86: + +- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock + +- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is + taken inside kvm->arch.mmu_lock, and cannot be taken without already + holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise + there's no need to take kvm->arch.tdp_mmu_pages_lock at all). Everything else is a leaf: no other lock is taken inside the critical sections. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8dfa0bd8d3f2..40222d9696f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1044,6 +1044,20 @@ struct kvm_arch { struct list_head tdp_mmu_roots; /* List of struct tdp_mmu_pages not being used as roots */ struct list_head tdp_mmu_pages; + + /* + * Protects accesses to the following fields when the MMU lock + * is held in read mode: + * - tdp_mmu_pages (above) + * - the link field of struct kvm_mmu_pages used by the TDP MMU + * - lpage_disallowed_mmu_pages + * - the lpage_disallowed_link field of struct kvm_mmu_pages used + * by the TDP MMU + * It is acceptable, but not necessary, to acquire this lock when + * the thread holds the MMU lock in write mode. + */ + spinlock_t tdp_mmu_pages_lock; + /* * VM-scope maximum vCPU ID. Used to determine the size of structures * that increase along with the maximum vCPU ID, in which case, using diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 42a3d4d0e3ce..5d47260e0048 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -7,6 +7,7 @@ #include "tdp_mmu.h" #include "spte.h" +#include #include #ifdef CONFIG_X86_64 @@ -33,6 +34,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); } @@ -225,7 +227,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level); + u64 old_spte, u64 new_spte, int level, + bool shared); static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) { @@ -267,17 +270,26 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * * @kvm: kvm instance * @sp: the new page + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be adding or removing pages. * @account_nx: This page replaces a NX large page and should be marked for * eventual reclaim. 
*/ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool account_nx) + bool shared, bool account_nx) { - lockdep_assert_held_write(&kvm->mmu_lock); + if (shared) + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); if (account_nx) account_huge_nx_page(kvm, sp); + + if (shared) + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** @@ -285,14 +297,24 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, * * @kvm: kvm instance * @sp: the page to be removed + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be adding or removing pages. */ -static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared) { - lockdep_assert_held_write(&kvm->mmu_lock); + if (shared) + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); list_del(&sp->link); if (sp->lpage_disallowed) unaccount_huge_nx_page(kvm, sp); + + if (shared) + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** @@ -300,28 +322,39 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp) * * @kvm: kvm instance * @pt: the page removed from the paging structure + * @shared: This operation may not be running under the exclusive use + * of the MMU lock and the operation must synchronize with other + * threads that might be modifying SPTEs. * * Given a page table that has been removed from the TDP paging structure, * iterates through the page table to clear SPTEs and free child page tables. */ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) +static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, + bool shared) { struct kvm_mmu_page *sp = sptep_to_sp(pt); int level = sp->role.level; gfn_t gfn = sp->gfn; u64 old_child_spte; + u64 *sptep; int i; trace_kvm_mmu_prepare_zap_page(sp); - tdp_mmu_unlink_page(kvm, sp); + tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - old_child_spte = READ_ONCE(*(pt + i)); - WRITE_ONCE(*(pt + i), 0); + sptep = pt + i; + + if (shared) { + old_child_spte = xchg(sptep, 0); + } else { + old_child_spte = READ_ONCE(*sptep); + WRITE_ONCE(*sptep, 0); + } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), - old_child_spte, 0, level - 1); + old_child_spte, 0, level - 1, shared); } kvm_flush_remote_tlbs_with_address(kvm, gfn, @@ -338,12 +371,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) * @old_spte: The value of the SPTE before the change * @new_spte: The value of the SPTE after the change * @level: the level of the PT the SPTE is part of in the paging structure + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be modifying SPTEs. * * Handle bookkeeping that might result from the modification of a SPTE. * This function must be called for all TDP SPTE modifications. 
*/ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) + u64 old_spte, u64 new_spte, int level, + bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); @@ -415,18 +452,51 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (was_present && !was_leaf && (pfn_changed || !is_present)) handle_removed_tdp_mmu_page(kvm, - spte_to_child_pt(old_spte, level)); + spte_to_child_pt(old_spte, level), shared); } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) + u64 old_spte, u64 new_spte, int level, + bool shared) { - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, + shared); handle_changed_spte_acc_track(old_spte, new_spte, level); handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, new_spte, level); } +/* + * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the + * associated bookkeeping + * + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + u64 *root_pt = tdp_iter_root_pt(iter); + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); + + lockdep_assert_held_read(&kvm->mmu_lock); + + if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, + new_spte) != iter->old_spte) + return false; + + handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level, true); + + return true; +} + + /* * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: kvm instance @@ -456,7 +526,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level); + iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); @@ -632,23 +702,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, int ret = RET_PF_FIXED; int make_spte_ret = 0; - if (unlikely(is_noslot_pfn(pfn))) { + if (unlikely(is_noslot_pfn(pfn))) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); - trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, - new_spte); - } else { + else make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, pfn, iter->old_spte, prefault, true, map_writable, !shadow_accessed_mask, &new_spte); - trace_kvm_mmu_set_spte(iter->level, iter->gfn, - rcu_dereference(iter->sptep)); - } if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else - tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + return RET_PF_RETRY; /* * If the page fault was caused by a write but the page is write @@ -662,8 +727,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. 
*/ - if (unlikely(is_mmio_spte(new_spte))) + if (unlikely(is_mmio_spte(new_spte))) { + trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, + new_spte); ret = RET_PF_EMULATE; + } else + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); trace_kvm_mmu_set_spte(iter->level, iter->gfn, rcu_dereference(iter->sptep)); @@ -722,7 +792,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, */ if (is_shadow_present_pte(iter.old_spte) && is_large_pte(iter.old_spte)) { - tdp_mmu_set_spte(vcpu->kvm, &iter, 0); + if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0)) + break; kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, KVM_PAGES_PER_HPAGE(iter.level)); @@ -739,19 +810,24 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); child_pt = sp->spt; - tdp_mmu_link_page(vcpu->kvm, sp, - huge_page_disallowed && - req_level >= iter.level); - new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - trace_kvm_mmu_get_page(sp, true); - tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, + new_spte)) { + tdp_mmu_link_page(vcpu->kvm, sp, true, + huge_page_disallowed && + req_level >= iter.level); + + trace_kvm_mmu_get_page(sp, true); + } else { + tdp_mmu_free_sp(sp); + break; + } } } - if (WARN_ON(iter.level != level)) { + if (iter.level != level) { rcu_read_unlock(); return RET_PF_RETRY; } -- Gitee From 1a281ae4e1750afe5a807e5519e1254d5984df1f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:51:47 +0800 Subject: [PATCH 019/158] KVM: x86/mmu: Flush TLBs after zap in TDP MMU PF handler mainline inclusion from mainline-v5.12-rc1 commit 08f07c800e9d35b59d0c8346333f189160bd67d4 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=08f07c800e9d35b59d0c8346333f189160bd67d4 ---------------------------------------------------------------------- When the TDP MMU is allowed to handle page faults in parallel there is the possiblity of a race where an SPTE is cleared and then imediately replaced with a present SPTE pointing to a different PFN, before the TLBs can be flushed. This race would violate architectural specs. Ensure that the TLBs are flushed properly before other threads are allowed to install any present value for the SPTE. Reviewed-by: Peter Feiner Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-22-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.h | 21 ++++++++++++- arch/x86/kvm/mmu/tdp_mmu.c | 63 ++++++++++++++++++++++++++++++++------ 2 files changed, 74 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f07aee04e903..8a8e47bbf804 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -124,6 +124,25 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; PT64_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT +/* + * If a thread running without exclusive control of the MMU lock must perform a + * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * non-present intermediate value. Other threads which encounter this value + * should not modify the SPTE. 
+ * + * This constant works because it is considered non-present on both AMD and + * Intel CPUs and does not create a L1TF vulnerability because the pfn section + * is zeroed out. + * + * Only used by the TDP MMU. + */ +#define REMOVED_SPTE (1ull << 59) + +static inline bool is_removed_spte(u64 spte) +{ + return spte == REMOVED_SPTE; +} + /* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to @@ -181,7 +200,7 @@ static inline bool is_access_track_spte(u64 spte) static inline bool is_shadow_present_pte(u64 pte) { - return (pte != 0) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte); } static inline bool is_large_pte(u64 pte) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 5d47260e0048..92d22ca2f529 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -427,15 +427,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE, it is - * unexpected. Log the change, though it should not impact the - * guest since both the former and current SPTEs are nonpresent. + * If this change does not involve a MMIO SPTE or removed SPTE, + * it is unexpected. Log the change, though it should not + * impact the guest since both the former and current SPTEs + * are nonpresent. */ - if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + if (WARN_ON(!is_mmio_spte(old_spte) && + !is_mmio_spte(new_spte) && + !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" - "are MMIO SPTEs.\n" + "are MMIO SPTEs, or the new SPTE is\n" + "a temporary removed SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -486,6 +490,13 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); + /* + * Do not change removed SPTEs. Only the thread that froze the SPTE + * may modify it. + */ + if (iter->old_spte == REMOVED_SPTE) + return false; + if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; @@ -496,6 +507,34 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, return true; } +static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) +{ + /* + * Freeze the SPTE by setting it to a special, + * non-present value. This will stop other threads from + * immediately installing a present entry in its place + * before the TLBs are flushed. + */ + if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + return false; + + kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, + KVM_PAGES_PER_HPAGE(iter->level)); + + /* + * No other thread can overwrite the removed SPTE as they + * must either wait on the MMU lock or use + * tdp_mmu_set_spte_atomic which will not overrite the + * special removed SPTE value. No bookkeeping is needed + * here since the SPTE is going from non-present + * to non-present. 
+ */ + WRITE_ONCE(*iter->sptep, 0); + + return true; +} + /* * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping @@ -523,6 +562,15 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, lockdep_assert_held_write(&kvm->mmu_lock); + /* + * No thread should be using this function to set SPTEs to the + * temporary removed SPTE value. + * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic + * should be used. If operating under the MMU lock in write mode, the + * use of the removed SPTE should not be necessary. + */ + WARN_ON(iter->old_spte == REMOVED_SPTE); + WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, @@ -792,12 +840,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, */ if (is_shadow_present_pte(iter.old_spte) && is_large_pte(iter.old_spte)) { - if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0)) + if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) break; - kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, - KVM_PAGES_PER_HPAGE(iter.level)); - /* * The iter must explicitly re-read the spte here * because the new value informs the !present -- Gitee From 240cdbe3b1df41ed1de54c18eca7cb12dfc0efc0 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:53:17 +0800 Subject: [PATCH 020/158] KVM: x86/mmu: Mark SPTEs in disconnected pages as removed mainline inclusion from mainline-v5.12-rc1 commit e25f0e0cd51bfb1e2e6121373c68f1427266bba7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e25f0e0cd51bfb1e2e6121373c68f1427266bba7 ---------------------------------------------------------------------- When clearing TDP MMU pages what have been disconnected from the paging structure root, set the SPTEs to a special non-present value which will not be overwritten by other threads. This is needed to prevent races in which a thread is clearing a disconnected page table, but another thread has already acquired a pointer to that memory and installs a mapping in an already cleared entry. This can lead to memory leaks and accounting errors. Reviewed-by: Peter Feiner Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-23-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 92d22ca2f529..950eb0366c74 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -334,9 +334,10 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, { struct kvm_mmu_page *sp = sptep_to_sp(pt); int level = sp->role.level; - gfn_t gfn = sp->gfn; + gfn_t base_gfn = sp->gfn; u64 old_child_spte; u64 *sptep; + gfn_t gfn; int i; trace_kvm_mmu_prepare_zap_page(sp); @@ -345,16 +346,39 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, for (i = 0; i < PT64_ENT_PER_PAGE; i++) { sptep = pt + i; + gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); if (shared) { - old_child_spte = xchg(sptep, 0); + /* + * Set the SPTE to a nonpresent value that other + * threads will not overwrite. 
If the SPTE was + * already marked as removed then another thread + * handling a page fault could overwrite it, so + * set the SPTE until it is set from some other + * value to the removed SPTE value. + */ + for (;;) { + old_child_spte = xchg(sptep, REMOVED_SPTE); + if (!is_removed_spte(old_child_spte)) + break; + cpu_relax(); + } } else { old_child_spte = READ_ONCE(*sptep); - WRITE_ONCE(*sptep, 0); + + /* + * Marking the SPTE as a removed SPTE is not + * strictly necessary here as the MMU lock will + * stop other threads from concurrently modifying + * this SPTE. Using the removed SPTE value keeps + * the two branches consistent and simplifies + * the function. + */ + WRITE_ONCE(*sptep, REMOVED_SPTE); } - handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), - gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), - old_child_spte, 0, level - 1, shared); + handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, + old_child_spte, REMOVED_SPTE, level - 1, + shared); } kvm_flush_remote_tlbs_with_address(kvm, gfn, -- Gitee From 8cb410ecc1bb3f9ec1e0bb28850d702858afbc89 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 15:54:44 +0800 Subject: [PATCH 021/158] KVM: x86/mmu: Allow parallel page faults for the TDP MMU mainline inclusion from mainline-v5.12-rc1 commit a2855afc7ee88475e8feb16840b23f787bfc994d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a2855afc7ee88475e8feb16840b23f787bfc994d ---------------------------------------------------------------------- Make the last few changes necessary to enable the TDP MMU to handle page faults in parallel while holding the mmu_lock in read mode. Reviewed-by: Peter Feiner Signed-off-by: Ben Gardon Message-Id: <20210202185734.1680553-24-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 79ddc61af1f2..24333ad3a89c 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3761,7 +3761,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return r; r = RET_PF_RETRY; - write_lock(&vcpu->kvm->mmu_lock); + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + read_lock(&vcpu->kvm->mmu_lock); + else + write_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; r = make_mmu_pages_available(vcpu); @@ -3776,7 +3781,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - write_unlock(&vcpu->kvm->mmu_lock); + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + read_unlock(&vcpu->kvm->mmu_lock); + else + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } -- Gitee From a48bc35ab1c3fed789ca1400feec79cc355b03a7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 17:58:11 +0800 Subject: [PATCH 022/158] KVM: x86: Return bool instead of int for CR4 and SREGS validity checks mainline inclusion from mainline-v5.11-rc1 commit ee69c92bac61f4379e97f40b259a1c1257e5987f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ee69c92bac61f4379e97f40b259a1c1257e5987f ---------------------------------------------------------------------- Rework the 
common CR4 and SREGS checks to return a bool instead of an int, i.e. true/false instead of 0/-EINVAL, and add "is" to the name to clarify the polarity of the return value (which is effectively inverted by this change). No functional changed intended. Signed-off-by: Sean Christopherson Message-Id: <20201007014417.29276-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/x86.c Signed-off-by: Yu Zhang --- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 30 +++++++++++++----------------- arch/x86/kvm/x86.h | 2 +- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 0c164ca686c7..3aaf0bdc586d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -278,7 +278,7 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits)) return false; } - if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) + if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; return true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 23e68c7161da..8c2651560b54 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3364,7 +3364,7 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) /* * We operate under the default treatment of SMM, so VMX cannot be * enabled under SMM. Note, whether or not VMXE is allowed at all is - * handled by kvm_valid_cr4(). + * handled by kvm_is_valid_cr4(). */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4ff95c6a50ac..8247e7a3f84b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1056,20 +1056,17 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) - return -EINVAL; + return false; if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) - return -EINVAL; - - if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4)) - return -EINVAL; + return false; - return 0; + return kvm_x86_ops.is_valid_cr4(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_valid_cr4); +EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1078,7 +1075,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) X86_CR4_SMEP; unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE; - if (kvm_valid_cr4(vcpu, cr4)) + if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -10095,7 +10092,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); -static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* @@ -10103,21 +10100,20 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) * 64-bit mode (though maybe in a 32-bit code segment). * CR4.PAE and EFER.LMA must be set. 
*/ - if (!(sregs->cr4 & X86_CR4_PAE) - || !(sregs->efer & EFER_LMA)) - return -EINVAL; + if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) + return false; if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits) - return -EINVAL; + return false; } else { /* * Not in 64-bit mode: EFER.LMA is clear and the code * segment cannot be 64-bit. */ if (sregs->efer & EFER_LMA || sregs->cs.l) - return -EINVAL; + return false; } - return kvm_valid_cr4(vcpu, sregs->cr4); + return kvm_is_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -10128,7 +10124,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) struct desc_ptr dt; int ret = -EINVAL; - if (kvm_valid_sregs(vcpu, sregs)) + if (!kvm_is_valid_sregs(vcpu, sregs)) goto out; apic_base_msr.data = sregs->apic_base; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 20a1f446acdf..de81c17afd6b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -378,7 +378,7 @@ static inline bool kvm_dr6_valid(u64 data) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); -- Gitee From 49ebebbe49b42d71f76e251e886a04f43a5522d3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:00:48 +0800 Subject: [PATCH 023/158] KVM: selftests: Verify supported CR4 bits can be set before KVM_SET_CPUID2 mainline inclusion from mainline-v5.11-rc1 commit 7a873e4555679a0e749422db071c142b57f80be9 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7a873e4555679a0e749422db071c142b57f80be9 ---------------------------------------------------------------------- Extend the KVM_SET_SREGS test to verify that all supported CR4 bits, as enumerated by KVM, can be set before KVM_SET_CPUID2, i.e. without first defining the vCPU model. KVM is supposed to skip guest CPUID checks when host userspace is stuffing guest state. Check the inverse as well, i.e. that KVM rejects KVM_SET_REGS if CR4 has one or more unsupported bits set. 
Signed-off-by: Sean Christopherson Message-Id: <20201007014417.29276-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- .../selftests/kvm/include/x86_64/processor.h | 17 ++++ .../selftests/kvm/include/x86_64/vmx.h | 4 - .../selftests/kvm/x86_64/set_sregs_test.c | 92 ++++++++++++++++++- 3 files changed, 108 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 55c69d982c3b..f6527704460e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -29,6 +29,7 @@ #define X86_CR4_OSFXSR (1ul << 9) #define X86_CR4_OSXMMEXCPT (1ul << 10) #define X86_CR4_UMIP (1ul << 11) +#define X86_CR4_LA57 (1ul << 12) #define X86_CR4_VMXE (1ul << 13) #define X86_CR4_SMXE (1ul << 14) #define X86_CR4_FSGSBASE (1ul << 16) @@ -38,6 +39,22 @@ #define X86_CR4_SMAP (1ul << 21) #define X86_CR4_PKE (1ul << 22) +/* CPUID.1.ECX */ +#define CPUID_VMX (1ul << 5) +#define CPUID_SMX (1ul << 6) +#define CPUID_PCID (1ul << 17) +#define CPUID_XSAVE (1ul << 26) + +/* CPUID.7.EBX */ +#define CPUID_FSGSBASE (1ul << 0) +#define CPUID_SMEP (1ul << 7) +#define CPUID_SMAP (1ul << 20) + +/* CPUID.7.ECX */ +#define CPUID_UMIP (1ul << 2) +#define CPUID_PKU (1ul << 3) +#define CPUID_LA57 (1ul << 16) + #define UNEXPECTED_VECTOR_PORT 0xfff0u /* General Registers in 64-Bit Mode */ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index e78d7e26ba61..65eb1079a161 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -11,10 +11,6 @@ #include #include "processor.h" -#define CPUID_VMX_BIT 5 - -#define CPUID_VMX (1 << 5) - /* * Definitions of Primary Processor-Based VM-Execution Controls. */ diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 9f7656184f31..318be0bf77ab 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -24,16 +24,106 @@ #define VCPU_ID 5 +static void test_cr4_feature_bit(struct kvm_vm *vm, struct kvm_sregs *orig, + uint64_t feature_bit) +{ + struct kvm_sregs sregs; + int rc; + + /* Skip the sub-test, the feature is supported. */ + if (orig->cr4 & feature_bit) + return; + + memcpy(&sregs, orig, sizeof(sregs)); + sregs.cr4 |= feature_bit; + + rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + TEST_ASSERT(rc, "KVM allowed unsupported CR4 bit (0x%lx)", feature_bit); + + /* Sanity check that KVM didn't change anything. 
*/ + vcpu_sregs_get(vm, VCPU_ID, &sregs); + TEST_ASSERT(!memcmp(&sregs, orig, sizeof(sregs)), "KVM modified sregs"); +} + +static uint64_t calc_cr4_feature_bits(struct kvm_vm *vm) +{ + struct kvm_cpuid_entry2 *cpuid_1, *cpuid_7; + uint64_t cr4; + + cpuid_1 = kvm_get_supported_cpuid_entry(1); + cpuid_7 = kvm_get_supported_cpuid_entry(7); + + cr4 = X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | + X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE | X86_CR4_PGE | + X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT; + if (cpuid_7->ecx & CPUID_UMIP) + cr4 |= X86_CR4_UMIP; + if (cpuid_7->ecx & CPUID_LA57) + cr4 |= X86_CR4_LA57; + if (cpuid_1->ecx & CPUID_VMX) + cr4 |= X86_CR4_VMXE; + if (cpuid_1->ecx & CPUID_SMX) + cr4 |= X86_CR4_SMXE; + if (cpuid_7->ebx & CPUID_FSGSBASE) + cr4 |= X86_CR4_FSGSBASE; + if (cpuid_1->ecx & CPUID_PCID) + cr4 |= X86_CR4_PCIDE; + if (cpuid_1->ecx & CPUID_XSAVE) + cr4 |= X86_CR4_OSXSAVE; + if (cpuid_7->ebx & CPUID_SMEP) + cr4 |= X86_CR4_SMEP; + if (cpuid_7->ebx & CPUID_SMAP) + cr4 |= X86_CR4_SMAP; + if (cpuid_7->ecx & CPUID_PKU) + cr4 |= X86_CR4_PKE; + + return cr4; +} + int main(int argc, char *argv[]) { struct kvm_sregs sregs; struct kvm_vm *vm; + uint64_t cr4; int rc; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - /* Create VM */ + /* + * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and + * use it to verify all supported CR4 bits can be set prior to defining + * the vCPU model, i.e. without doing KVM_SET_CPUID2. + */ + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + + sregs.cr4 |= calc_cr4_feature_bits(vm); + cr4 = sregs.cr4; + + rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", + sregs.cr4, cr4); + + /* Verify all unsupported features are rejected by KVM. */ + test_cr4_feature_bit(vm, &sregs, X86_CR4_UMIP); + test_cr4_feature_bit(vm, &sregs, X86_CR4_LA57); + test_cr4_feature_bit(vm, &sregs, X86_CR4_VMXE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_SMXE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_FSGSBASE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_PCIDE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_OSXSAVE); + test_cr4_feature_bit(vm, &sregs, X86_CR4_SMEP); + test_cr4_feature_bit(vm, &sregs, X86_CR4_SMAP); + test_cr4_feature_bit(vm, &sregs, X86_CR4_PKE); + kvm_vm_free(vm); + + /* Create a "real" VM and verify APIC_BASE can be set. */ vm = vm_create_default(VCPU_ID, 0, NULL); vcpu_sregs_get(vm, VCPU_ID, &sregs); -- Gitee From 0921ddaa38c389d8adb564bc9b61d9784ddbc946 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:06:18 +0800 Subject: [PATCH 024/158] KVM: x86: Add a helper to check for a legal GPA mainline inclusion from mainline-v5.12-rc1 commit 4bda0e97868a95553ba71d87f593756e1ffd654b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4bda0e97868a95553ba71d87f593756e1ffd654b ---------------------------------------------------------------------- Add a helper to check for a legal GPA, and use it to consolidate code in existing, related helpers. Future patches will extend usage to VMX and SVM code, properly handle exceptions to the maxphyaddr rule, and add more helpers. No functional change intended. 
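A usage sketch (hypothetical call site, not taken from this series) of how the new
helper is meant to read:

  /*
   * Sketch: validate a guest-supplied physical address before emulating an
   * access through it; any bit at or above MAXPHYADDR makes the GPA illegal.
   */
  if (!kvm_vcpu_is_legal_gpa(vcpu, gpa))
          return 1;       /* e.g. inject #GP into the guest */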
Signed-off-by: Sean Christopherson Message-Id: <20210204000117.3303214-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/cpuid.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a7126dd9470d..f32853ef88dd 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -76,9 +76,19 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) return vcpu->arch.maxphyaddr; } +static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return !(gpa >> cpuid_maxphyaddr(vcpu)); +} + static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); + return !kvm_vcpu_is_legal_gpa(vcpu, gpa); +} + +static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && kvm_vcpu_is_legal_gpa(vcpu, gpa); } /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ @@ -402,11 +412,6 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) kvm_cpu_cap_set(x86_feature); } -static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, unsigned int kvm_feature) { -- Gitee From d46cbf22e5695d144b8b518213868f83292e4ba1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:07:43 +0800 Subject: [PATCH 025/158] KVM: x86: Add a helper to handle legal GPA with an alignment requirement mainline inclusion from mainline-v5.12-rc1 commit da6c6a7c06e268f53c0560edc9dff372f11218f5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=da6c6a7c06e268f53c0560edc9dff372f11218f5 ---------------------------------------------------------------------- Add a helper to genericize checking for a legal GPA that also must conform to an arbitrary alignment, and use it in the existing page_address_valid(). Future patches will replace open coded variants in VMX and SVM. 
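A minimal sketch of the intended callers (hypothetical; the actual VMX and SVM
conversions land in the following patches):

  /* A page-sized structure: must be 4KiB aligned and a legal GPA. */
  if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE))
          return -EINVAL;

  /* The alignment is arbitrary, e.g. a 16-byte aligned VM-entry MSR list. */
  if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16))
          return -EINVAL;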
Signed-off-by: Sean Christopherson Message-Id: <20210204000117.3303214-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/cpuid.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f32853ef88dd..8ac43a45d625 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -86,9 +86,15 @@ static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return !kvm_vcpu_is_legal_gpa(vcpu, gpa); } +static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t alignment) +{ + return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa); +} + static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) { - return PAGE_ALIGNED(gpa) && kvm_vcpu_is_legal_gpa(vcpu, gpa); + return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE); } /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ -- Gitee From f1c521778850eb2d22a442564790dcaa0c15396c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:09:28 +0800 Subject: [PATCH 026/158] KVM: VMX: Use GPA legality helpers to replace open coded equivalents mainline inclusion from mainline-v5.12-rc1 commit 636e8b733491135aacf9d1dfd23d9e77637f1198 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=636e8b733491135aacf9d1dfd23d9e77637f1198 ---------------------------------------------------------------------- Replace a variety of open coded GPA checks with the recently introduced common helpers. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210204000117.3303214-6-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/vmx/nested.c | 26 +++++++------------------- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1686e85bff13..50b8015726e2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -748,8 +748,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, (CC(!nested_cpu_has_vid(vmcs12)) || CC(!nested_exit_intr_ack_set(vcpu)) || CC((vmcs12->posted_intr_nv & 0xff00)) || - CC((vmcs12->posted_intr_desc_addr & 0x3f)) || - CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))) + CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -762,13 +761,11 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { - int maxphyaddr; - if (count == 0) return 0; - maxphyaddr = cpuid_maxphyaddr(vcpu); - if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || - (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) + + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || + !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; return 0; @@ -1066,14 +1063,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, } } -static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long invalid_mask; - - invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - return (val & invalid_mask) == 0; -} - /* * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. 
* tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't @@ -1125,7 +1114,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, enum vm_entry_failure_code *entry_failure_code) { - if (CC(!nested_cr3_valid(vcpu, cr3))) { + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -2671,7 +2660,6 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); /* Check for memory type validity */ switch (new_eptp & VMX_EPTP_MT_MASK) { @@ -2702,7 +2690,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) } /* Reserved bits should not be set */ - if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f))) + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ @@ -2897,7 +2885,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || - CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) + CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) return -EINVAL; if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8c2651560b54..b5a57b0a8f5b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1204,7 +1204,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) { /* The base must be 128-byte aligned and a legal physical address. */ - return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f); + return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); } static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) -- Gitee From 540a3dbca8b493af902954e6b7b1bb89b707ab72 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:13:59 +0800 Subject: [PATCH 027/158] KVM: nSVM: Use common GPA helper to check for illegal CR3 mainline inclusion from mainline-v5.12-rc1 commit bbc2c63ddd51cd6d349e3fe0010f9b7b259e58ea category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bbc2c63ddd51cd6d349e3fe0010f9b7b259e58ea ---------------------------------------------------------------------- Replace an open coded check for an invalid CR3 with its equivalent helper. No functional change intended. 
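Before/after sketch of the conversion; the two forms are equivalent here because, at
this point in the series, kvm_vcpu_is_illegal_gpa() still reduces to the plain
maxphyaddr check (the SEV C-bit adjustment arrives in the next patch):

  /* Before: open coded reserved-bit check on the nested CR3. */
  if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
          return -EINVAL;

  /* After: the same check expressed through the common GPA helper. */
  if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
          return -EINVAL;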
Signed-off-by: Sean Christopherson Message-Id: <20210204000117.3303214-7-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/svm/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3aaf0bdc586d..096f9526dfee 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -372,7 +372,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) { - if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)) + if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return -EINVAL; if (!nested_npt && is_pae_paging(vcpu) && -- Gitee From 19cb41e98efe3a2cd17f50f4e75522c385b85b96 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:16:02 +0800 Subject: [PATCH 028/158] KVM: x86: SEV: Treat C-bit as legal GPA bit regardless of vCPU mode mainline inclusion from mainline-v5.12-rc1 commit ca29e14506bd66d50733c1f3e4448aba54e70cc7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ca29e14506bd66d50733c1f3e4448aba54e70cc7 ---------------------------------------------------------------------- Rename cr3_lm_rsvd_bits to reserved_gpa_bits, and use it for all GPA legality checks. AMD's APM states: If the C-bit is an address bit, this bit is masked from the guest physical address when it is translated through the nested page tables. Thus, any access that can conceivably be run through NPT should ignore the C-bit when checking for validity. For features that KVM emulates in software, e.g. MTRRs, there is no clear direction in the APM for how the C-bit should be handled. For such cases, follow the SME behavior inasmuch as possible, since SEV is is essentially a VM-specific variant of SME. For SME, the APM states: In this case the upper physical address bits are treated as reserved when the feature is enabled except where otherwise indicated. Collecting the various relavant SME snippets in the APM and cross- referencing the omissions with Linux kernel code, this leaves MTTRs and APIC_BASE as the only flows that KVM emulates that should _not_ ignore the C-bit. Note, this means the reserved bit checks in the page tables are technically broken. This will be remedied in a future patch. Although the page table checks are technically broken, in practice, it's all but guaranteed to be irrelevant. NPT is required for SEV, i.e. shadowing page tables isn't needed in the common case. Theoretically, the checks could be in play for nested NPT, but it's extremely unlikely that anyone is running nested VMs on SEV, as doing so would require L1 to expose sensitive data to L0, e.g. the entire VMCB. And if anyone is running nested VMs, L0 can't read the guest's encrypted memory, i.e. L1 would need to put its NPT in shared memory, in which case the C-bit will never be set. Or, L1 could use shadow paging, but again, if L0 needs to read page tables, e.g. to load PDPTRs, the memory can't be encrypted if L1 has any expectation of L0 doing the right thing. 
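For intuition, a sketch of the net effect for an SEV guest (the local variable c_bit
is illustrative; the code below derives the real position from CPUID 0x8000001F[EBX]
bits 5:0):

  /* Architectural rule: bits [63:MAXPHYADDR] of a GPA are reserved. */
  vcpu->arch.reserved_gpa_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);

  /*
   * SEV: the C-bit is a legal GPA bit for any flow that can be routed
   * through NPT, so strip it from the reserved mask.
   */
  vcpu->arch.reserved_gpa_bits &= ~(1UL << c_bit);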
Cc: Tom Lendacky Cc: Brijesh Singh Signed-off-by: Sean Christopherson Message-Id: <20210204000117.3303214-8-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 2 +- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/x86.c | 7 +++---- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 40222d9696f3..f2433f21a435 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -662,7 +662,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - unsigned long cr3_lm_rsvd_bits; + u64 reserved_gpa_bits; int maxphyaddr; /* emulate context */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6c69733d2c98..a8d412c06c7c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -222,7 +222,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + vcpu->arch.reserved_gpa_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); /* Invoke the vendor callback only after the above state is updated. */ kvm_x86_ops.vcpu_after_set_cpuid(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8ac43a45d625..b9871e4f1396 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -78,7 +78,7 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - return !(gpa >> cpuid_maxphyaddr(vcpu)); + return !(gpa & vcpu->arch.reserved_gpa_bits); } static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 096f9526dfee..f85580a9aded 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -275,7 +275,7 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) if (vmcb12_lma) { if (!(vmcb12->save.cr4 & X86_CR4_PAE) || !(vmcb12->save.cr0 & X86_CR0_PE) || - (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits)) + kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3)) return false; } if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9f6d8ea4b57c..c10435ae3158 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3777,7 +3777,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (sev_guest(vcpu->kvm)) { best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); if (best) - vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f)); + vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } if (!kvm_vcpu_apicv_active(vcpu)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8247e7a3f84b..0ed657b01cbd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1128,8 +1128,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu) && - (cr3 & vcpu->arch.cr3_lm_rsvd_bits)) + if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; else if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) @@ -10102,7 +10101,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) */ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) return false; - if (sregs->cr3 & 
vcpu->arch.cr3_lm_rsvd_bits) + if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) return false; } else { /* @@ -10449,7 +10448,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + vcpu->arch.reserved_gpa_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; -- Gitee From def4cf0d256f2f229e68d6e5a8455ef4cd1536ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:17:58 +0800 Subject: [PATCH 029/158] KVM: x86: Use reserved_gpa_bits to calculate reserved PxE bits mainline inclusion from mainline-v5.12-rc1 commit 5b7f575ccd29eb1a0b013961bee5957574046094 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5b7f575ccd29eb1a0b013961bee5957574046094 ---------------------------------------------------------------------- Use reserved_gpa_bits, which accounts for exceptions to the maxphyaddr rule, e.g. SEV's C-bit, for the page {table,directory,etc...} entry (PxE) reserved bits checks. For SEV, the C-bit is ignored by hardware when walking pages tables, e.g. the APM states: Note that while the guest may choose to set the C-bit explicitly on instruction pages and page table addresses, the value of this bit is a don't-care in such situations as hardware always performs these as private accesses. Such behavior is expected to hold true for other features that repurpose GPA bits, e.g. KVM could theoretically emulate SME or MKTME, which both allow non-zero repurposed bits in the page tables. Conceptually, KVM should apply reserved GPA checks universally, and any features that do not adhere to the basic rule should be explicitly handled, i.e. if a GPA bit is repurposed but not allowed in page tables for whatever reason. Refactor __reset_rsvds_bits_mask() to take the pre-generated reserved bits mask, and opportunistically clean up its code, e.g. to align lines and comments. Practically speaking, this is change is a likely a glorified nop given the current KVM code base. SEV's C-bit is the only repurposed GPA bit, and KVM doesn't support shadowing encrypted page tables (which is theoretically possible via SEV debug APIs). Cc: Rick Edgecombe Signed-off-by: Sean Christopherson Message-Id: <20210204000117.3303214-9-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/cpuid.c Signed-off-by: Yu Zhang --- arch/x86/kvm/cpuid.c | 10 ++-- arch/x86/kvm/mmu/mmu.c | 104 ++++++++++++++++++++--------------------- arch/x86/kvm/x86.c | 3 +- 3 files changed, 58 insertions(+), 59 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a8d412c06c7c..06a20ff9c5be 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -216,16 +216,20 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - kvm_mmu_reset_context(vcpu); + vcpu->arch.reserved_gpa_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - vcpu->arch.reserved_gpa_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); - /* Invoke the vendor callback only after the above state is updated. 
*/ kvm_x86_ops.vcpu_after_set_cpuid(vcpu); + + /* + * Except for the MMU, which needs to be reset after any vendor + * specific adjustments to the reserved GPA bits. + */ + kvm_mmu_reset_context(vcpu); } static int is_efer_nx(void) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 24333ad3a89c..1e5621cb9b3e 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4028,20 +4028,27 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, static void __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct rsvd_bits_validate *rsvd_check, - int maxphyaddr, int level, bool nx, bool gbpages, + u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { - u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; + u64 high_bits_rsvd; rsvd_check->bad_mt_xwr = 0; - if (!nx) - exb_bit_rsvd = rsvd_bits(63, 63); if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); + if (level == PT32E_ROOT_LEVEL) + high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); + else + high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); + + /* Note, NX doesn't exist in PDPTEs, this is handled below. */ + if (!nx) + high_bits_rsvd |= rsvd_bits(63, 63); + /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. @@ -4070,45 +4077,39 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - rsvd_check->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 63) | - rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PDE */ - rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PTE */ - rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62) | - rsvd_bits(13, 20); /* large page */ + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | + high_bits_rsvd | + rsvd_bits(5, 8) | + rsvd_bits(1, 2); /* PDPTE */ + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_5LEVEL: - rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | - rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | + nonleaf_bit8_rsvd | + rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; fallthrough; case PT64_ROOT_4LEVEL: - rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - gbpages_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | + nonleaf_bit8_rsvd | + rsvd_bits(7, 7); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | + gbpages_bit_rsvd; + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | - gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 29); - rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 
51) | - rsvd_bits(13, 20); /* large page */ + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | + gbpages_bit_rsvd | + rsvd_bits(13, 29); + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; @@ -4119,8 +4120,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, - cpuid_maxphyaddr(vcpu), context->root_level, - context->nx, + vcpu->arch.reserved_gpa_bits, + context->root_level, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), guest_cpuid_is_amd_or_hygon(vcpu)); @@ -4128,27 +4129,22 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - int maxphyaddr, bool execonly) + u64 pa_bits_rsvd, bool execonly) { + u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 bad_mt_xwr; - rsvd_check->rsvd_bits_mask[0][4] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][3] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][1] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); + rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - rsvd_check->rsvd_bits_mask[1][1] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29); + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ @@ -4167,7 +4163,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, - cpuid_maxphyaddr(vcpu), execonly); + vcpu->arch.reserved_gpa_bits, execonly); } /* @@ -4197,7 +4193,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - shadow_phys_bits, + rsvd_bits(shadow_phys_bits, 63), context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4234,13 +4230,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - shadow_phys_bits, + rsvd_bits(shadow_phys_bits, 63), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - shadow_phys_bits, + rsvd_bits(shadow_phys_bits, 63), false); if (!shadow_me_mask) @@ -4261,7 +4257,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { 
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - shadow_phys_bits, execonly); + rsvd_bits(shadow_phys_bits, 63), execonly); } #define BYTE_MASK(access) \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ed657b01cbd..c87b17205ff6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -827,8 +827,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { - return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | - rsvd_bits(1, 2); + return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); } /* -- Gitee From 2444733e7eb1b0fe3d9ea4a1742b9680b722a6be Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:20:16 +0800 Subject: [PATCH 030/158] KVM: x86/mmu: Add helper to generate mask of reserved HPA bits mainline inclusion from mainline-v5.12-rc1 commit 6f8e65a60168567cc59f9b99980ea9112d4152f5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6f8e65a60168567cc59f9b99980ea9112d4152f5 ---------------------------------------------------------------------- Add a helper to generate the mask of reserved PA bits in the host. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210204000117.3303214-10-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1e5621cb9b3e..e7233a1738f5 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4166,6 +4166,11 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, vcpu->arch.reserved_gpa_bits, execonly); } +static inline u64 reserved_hpa_bits(void) +{ + return rsvd_bits(shadow_phys_bits, 63); +} + /* * the page table on host is the shadow page table for the page * table in guest or amd nested guest, its mmu features completely @@ -4193,7 +4198,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - rsvd_bits(shadow_phys_bits, 63), + reserved_hpa_bits(), context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4230,14 +4235,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - rsvd_bits(shadow_phys_bits, 63), + reserved_hpa_bits(), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - rsvd_bits(shadow_phys_bits, 63), - false); + reserved_hpa_bits(), false); if (!shadow_me_mask) return; @@ -4257,7 +4261,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - rsvd_bits(shadow_phys_bits, 63), execonly); + reserved_hpa_bits(), execonly); } #define BYTE_MASK(access) \ -- Gitee From 9e2c9cf1540ac4a12d89fb36089fbf688b41974f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:22:01 +0800 Subject: [PATCH 031/158] KVM: x86: Add helper to consolidate "raw" reserved GPA mask calculations mainline inclusion from mainline-v5.12-rc1 commit a8ac864a7d6dbc2fc43081b1eecd9e0183065d47 category: feature bugzilla: 
https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a8ac864a7d6dbc2fc43081b1eecd9e0183065d47 ---------------------------------------------------------------------- Add a helper to generate the mask of reserved GPA bits _without_ any adjustments for repurposed bits, and use it to replace a variety of open coded variants in the MTRR and APIC_BASE flows. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210204000117.3303214-11-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/cpuid.c | 12 +++++++++++- arch/x86/kvm/cpuid.h | 1 + arch/x86/kvm/mtrr.c | 12 ++++++------ arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 06a20ff9c5be..2f67d137ff2f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -216,7 +216,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.reserved_gpa_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = @@ -270,6 +270,16 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) return 36; } +/* + * This "raw" version returns the reserved GPA bits without any adjustments for + * encryption technologies that usurp bits. The raw mask should be used if and + * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs. + */ +u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); +} + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b9871e4f1396..7ba00522c51c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -70,6 +70,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); +u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 7f0059aa30e1..a9356e97d0a2 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -75,7 +75,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* variable MTRRs */ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); - mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { /* MTRR base */ if (!valid_mtrr_type(data & 0xff)) @@ -355,14 +355,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (var_mtrr_range_is_valid(cur)) list_del(&mtrr_state->var_ranges[index].node); - /* Extend the mask with all 1 bits to the left, since those - * bits must implicitly be 0. The bits are then cleared - * when reading them. + /* + * Set all illegal GPA bits in the mask, since those bits must + * implicitly be 0. The bits are then cleared when reading them. */ if (!is_mtrr_mask) cur->base = data; else - cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu)); + cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); /* add it to the list if it's enabled. 
*/ if (var_mtrr_range_is_valid(cur)) { @@ -430,7 +430,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) else *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; - *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1; + *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); } return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c87b17205ff6..396e32d81438 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -470,7 +470,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); - u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff | + u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) @@ -10447,7 +10447,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.reserved_gpa_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; -- Gitee From 32390924f9efad4d6cec9a7100500f9f3a2d3492 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 6 Feb 2021 09:53:33 -0500 Subject: [PATCH 032/158] KVM: x86: compile out TDP MMU on 32-bit systems mainline inclusion from mainline-v5.12-rc1 commit 897218ff7cf19290ec2d69652ce673d8ed6fedeb category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=897218ff7cf19290ec2d69652ce673d8ed6fedeb ---------------------------------------------------------------------- The TDP MMU assumes that it can do atomic accesses to 64-bit PTEs. Rather than just disabling it, compile it out completely so that it is possible to use for example 64-bit xchg. To limit the number of stubs, wrap all accesses to tdp_mmu_enabled or tdp_mmu_page with a function. Calls to all other functions in tdp_mmu.c are eliminated and do not even reach the linker. Reviewed-by: Sean Christopherson Tested-by: Sean Christopherson Signed-off-by: Paolo Bonzini conflict: arch/x86/include/asm/kvm_host.h arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/Makefile | 3 ++- arch/x86/kvm/mmu/mmu.c | 37 +++++++++++++++++---------------- arch/x86/kvm/mmu/mmu_internal.h | 2 ++ arch/x86/kvm/mmu/tdp_mmu.c | 29 +------------------------- arch/x86/kvm/mmu/tdp_mmu.h | 32 ++++++++++++++++++++++++---- 6 files changed, 54 insertions(+), 51 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f2433f21a435..84ad7fb973a8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1031,6 +1031,7 @@ struct kvm_arch { struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; +#ifdef CONFIG_X86_64 /* * Whether the TDP MMU is enabled for this VM. This contains a * snapshot of the TDP MMU module parameter from when the VM was @@ -1057,6 +1058,7 @@ struct kvm_arch { * the thread holds the MMU lock in write mode. */ spinlock_t tdp_mmu_pages_lock; +#endif /* CONFIG_X86_64 */ /* * VM-scope maximum vCPU ID. 
Used to determine the size of structures diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 1c6c9adcb730..3fc65dd3a33b 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o + mmu/spte.o +kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e7233a1738f5..02fd8322476c 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1222,7 +1222,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); while (mask) { @@ -1251,7 +1251,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); while (mask) { @@ -1298,7 +1298,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); @@ -1510,7 +1510,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); return r; @@ -1522,7 +1522,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); return r; @@ -1577,7 +1577,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) int young = false; young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); return young; @@ -1588,7 +1588,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) int young = false; young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_hva(kvm, hva); return young; @@ -3145,7 +3145,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; if (kvm_mmu_put_root(kvm, sp)) { - if (sp->tdp_mmu_page) + if (is_tdp_mmu_page(sp)) kvm_tdp_mmu_free_root(kvm, sp); else if (sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -3239,7 +3239,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) hpa_t root; unsigned i; - if (vcpu->kvm->arch.tdp_mmu_enabled) { + if (is_tdp_mmu_enabled(vcpu->kvm)) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); if (!VALID_PAGE(root)) @@ -5506,7 +5506,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_zap_obsolete_pages(kvm); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); @@ -5569,7 
+5569,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } - if (kvm->arch.tdp_mmu_enabled) { + if (is_tdp_mmu_enabled(kvm)) { flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); if (flush) kvm_flush_remote_tlbs(kvm); @@ -5593,7 +5593,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); write_unlock(&kvm->mmu_lock); @@ -5659,7 +5659,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, kvm_mmu_zap_collapsible_spte, true); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); write_unlock(&kvm->mmu_lock); } @@ -5686,7 +5686,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); write_unlock(&kvm->mmu_lock); @@ -5709,7 +5709,7 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); write_unlock(&kvm->mmu_lock); @@ -5725,7 +5725,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); write_unlock(&kvm->mmu_lock); @@ -5753,7 +5753,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); @@ -6077,7 +6077,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - if (sp->tdp_mmu_page) { + + if (is_tdp_mmu_page(sp)) { flush |= kvm_tdp_mmu_zap_sp(kvm, sp); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 7f599cc64178..cf67fa6fb8fe 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -56,10 +56,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; +#ifdef CONFIG_X86_64 bool tdp_mmu_page; /* Used for freeing the page asyncronously if it is a TDP MMU page. 
*/ struct rcu_head rcu_head; +#endif }; extern struct kmem_cache *mmu_page_header_cache; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 950eb0366c74..35a0113d8b70 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,24 +10,13 @@ #include #include -#ifdef CONFIG_X86_64 static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); -#endif - -static bool is_tdp_mmu_enabled(void) -{ -#ifdef CONFIG_X86_64 - return tdp_enabled && READ_ONCE(tdp_mmu_enabled); -#else - return false; -#endif /* CONFIG_X86_64 */ -} /* Initializes the TDP MMU for the VM, if enabled. */ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) { - if (!is_tdp_mmu_enabled()) + if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) return; /* This should not be changed for the lifetime of the VM. */ @@ -96,22 +85,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, #define for_each_tdp_mmu_root(_kvm, _root) \ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) -bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) -{ - struct kvm_mmu_page *sp; - - if (!kvm->arch.tdp_mmu_enabled) - return false; - if (WARN_ON(!VALID_PAGE(hpa))) - return false; - - sp = to_shadow_page(hpa); - if (WARN_ON(!sp)) - return false; - - return sp->tdp_mmu_page && sp->root_count; -} - static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield, bool flush); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index a7a3f6db263d..1e2b32f76f99 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -5,10 +5,6 @@ #include -void kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); - -bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); @@ -63,4 +59,32 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +#ifdef CONFIG_X86_64 +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } +#else +static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} +static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } +#endif + +static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +{ + struct kvm_mmu_page *sp; + + if (!is_tdp_mmu_enabled(kvm)) + return false; + if (WARN_ON(!VALID_PAGE(hpa))) + return false; + + sp = to_shadow_page(hpa); + if (WARN_ON(!sp)) + return false; + + return is_tdp_mmu_page(sp) && sp->root_count; +} + #endif /* __KVM_X86_MMU_TDP_MMU_H */ -- Gitee From 7b87502540b0a9dc02da58f478ff3f269aca3e3b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 28 Jan 2021 13:01:31 -0500 Subject: [PATCH 033/158] KVM: Raise the maximum number of user memslots mainline inclusion from mainline-v5.12-rc1 commit 4fc096a99e01dd06dc55bef76ade7f8d76653245 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4fc096a99e01dd06dc55bef76ade7f8d76653245 This commit also removes the risc-v's and loongarch's version of KVM_USER_MEM_SLOTS, which never appeared in the upstream code, yet was added in backporting commit e91613aec0b9 ("RISC-V: Add initial skeletal KVM support"), and commit 622f37d34174 ("LoongArch: kvm: add initial kvm support"). The reason should be due to lack of this current commit. So just also do it for risc-v and loongarch, to use the generic definition of KVM_USER_MEM_SLOTS. ---------------------------------------------------------------------- Current KVM_USER_MEM_SLOTS limits are arch specific (512 on Power, 509 on x86, 32 on s390, 16 on MIPS) but they don't really need to be. Memory slots are allocated dynamically in KVM when added so the only real limitation is 'id_to_index' array which is 'short'. We don't have any other KVM_MEM_SLOTS_NUM/KVM_USER_MEM_SLOTS-sized statically defined structures. Low KVM_USER_MEM_SLOTS can be a limiting factor for some configurations. In particular, when QEMU tries to start a Windows guest with Hyper-V SynIC enabled and e.g. 256 vCPUs the limit is hit as SynIC requires two pages per vCPU and the guest is free to pick any GFN for each of them, this fragments memslots as QEMU wants to have a separate memslot for each of these pages (which are supposed to act as 'overlay' pages). Signed-off-by: Vitaly Kuznetsov Message-Id: <20210127175731.2020089-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/arm64/include/asm/kvm_host.h | 1 - arch/loongarch/include/asm/kvm_host.h | 1 - arch/mips/include/asm/kvm_host.h | 1 - arch/powerpc/include/asm/kvm_host.h | 1 - arch/riscv/include/asm/kvm_host.h | 1 - arch/s390/include/asm/kvm_host.h | 1 - arch/x86/include/asm/kvm_host.h | 2 -- include/linux/kvm_host.h | 5 ++--- 8 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 9cbd22aa0e1c..dc445f074c77 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -29,7 +29,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED -#define KVM_USER_MEM_SLOTS 512 #define KVM_HALT_POLL_NS_DEFAULT 500000 #include diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 8f181f8cfa8c..e036ca0dbe51 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -29,7 +29,6 @@ #define LOONGSON_VIRT_REG_BASE 0x1f000000 #define KVM_MAX_VCPUS 256 -#define KVM_USER_MEM_SLOTS 256 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 24f3d0f9996b..3a5612e7304c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -83,7 +83,6 @@ #define KVM_MAX_VCPUS 16 -#define KVM_USER_MEM_SLOTS 16 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d67a470e95a3..2b9b6855ec86 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -28,7 +28,6 @@ #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS -#define KVM_USER_MEM_SLOTS 512 #include diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index dc2666b4180b..0818705e3c2b 100644 --- a/arch/riscv/include/asm/kvm_host.h 
+++ b/arch/riscv/include/asm/kvm_host.h @@ -20,7 +20,6 @@ #define KVM_MAX_VCPUS (1U << 9) #endif -#define KVM_USER_MEM_SLOTS 512 #define KVM_HALT_POLL_NS_DEFAULT 500000 #define KVM_VCPU_MAX_FEATURES 0 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 171913b9a925..4ca1252789e8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -28,7 +28,6 @@ #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 #define KVM_MAX_VCPUS 255 -#define KVM_USER_MEM_SLOTS 32 /* * These seem to be used for allocating ->chip in the routing table, which we diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 84ad7fb973a8..e4d2a7d4084e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,10 +40,8 @@ #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_MAX_VCPU_ID 1023 -#define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) #define KVM_HALT_POLL_NS_DEFAULT 200000 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 97407c065b3e..2fdf2c65e8fa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -419,9 +419,8 @@ struct kvm_irq_routing_table { #define KVM_PRIVATE_MEM_SLOTS 0 #endif -#ifndef KVM_MEM_SLOTS_NUM -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#endif +#define KVM_MEM_SLOTS_NUM SHRT_MAX +#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS) #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) -- Gitee From f5af1e5fb43a0b23142ec5ebf50192f6d75b57d2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Feb 2021 11:45:21 +0900 Subject: [PATCH 034/158] KVM: x86/mmu: Skip mmu_notifier check when handling MMIO page fault mainline inclusion from mainline-v5.12-rc1 commit 5f8a7cf25a7da5c2bbde25b3f0aca31459d20741 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f8a7cf25a7da5c2bbde25b3f0aca31459d20741 ---------------------------------------------------------------------- Don't retry a page fault due to an mmu_notifier invalidation when handling a page fault for a GPA that did not resolve to a memslot, i.e. an MMIO page fault. Invalidations from the mmu_notifier signal a change in a host virtual address (HVA) mapping; without a memslot, there is no HVA and thus no possibility that the invalidation is relevant to the page fault being handled. Note, the MMIO vs. memslot generation checks handle the case where a pending memslot will create a memslot overlapping the faulting GPA. The mmu_notifier checks are orthogonal to memslot updates. 
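For reference, a simplified sketch of what mmu_notifier_retry() checks, and therefore
of what an MMIO fault (no memslot, hence no HVA that an invalidation could cover) can
now safely skip:

  static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
  {
          if (unlikely(kvm->mmu_notifier_count))
                  return 1;       /* an invalidation is in flight */
          smp_rmb();
          if (kvm->mmu_notifier_seq != mmu_seq)
                  return 1;       /* an invalidation completed after mmu_seq was sampled */
          return 0;
  }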
Signed-off-by: Sean Christopherson Message-Id: <20210222024522.1751719-2-stevensd@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 02fd8322476c..3c81d0b8f879 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3767,7 +3767,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 263b85384ef8..51dce186513b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -869,7 +869,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); -- Gitee From d56c9a1d95139156848f7d0e871dd078693b9098 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Mon, 22 Feb 2021 11:45:22 +0900 Subject: [PATCH 035/158] KVM: x86/mmu: Consider the hva in mmu_notifier retry mainline inclusion from mainline-v5.12-rc1 commit 4a42d848db9544e3108875390886dc490d9c101e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4a42d848db9544e3108875390886dc490d9c101e ---------------------------------------------------------------------- Track the range being invalidated by mmu_notifier and skip page fault retries if the fault address is not affected by the in-progress invalidation. Handle concurrent invalidations by finding the minimal range which includes all ranges being invalidated. Although the combined range may include unrelated addresses and cannot be shrunk as individual invalidation operations complete, it is unlikely the marginal gains of proper range tracking are worth the additional complexity. The primary benefit of this change is the reduction in the likelihood of extreme latency when handing a page fault due to another thread having been preempted while modifying host virtual addresses. 
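Conceptually (sketch only, using the range fields this patch introduces in struct
kvm), the retry condition tightens from "any invalidation in flight" to "an
invalidation that covers this fault's hva":

  /* invalidate_range_start: grow a single window over concurrent ranges. */
  kvm->mmu_notifier_range_start = min(kvm->mmu_notifier_range_start, start);
  kvm->mmu_notifier_range_end   = max(kvm->mmu_notifier_range_end, end);

  /* Page fault path: only force a retry if the hva falls inside that window. */
  if (unlikely(kvm->mmu_notifier_count) &&
      hva >= kvm->mmu_notifier_range_start &&
      hva < kvm->mmu_notifier_range_end)
          return 1;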
Signed-off-by: David Stevens Message-Id: <20210222024522.1751719-3-stevensd@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/x86/kvm/mmu/mmu.c | 23 ++++++++++++++------ arch/x86/kvm/mmu/paging_tmpl.h | 14 ++++++++++--- include/linux/kvm_host.h | 25 +++++++++++++++++++++- virt/kvm/kvm_main.c | 29 ++++++++++++++++++++++---- 6 files changed, 79 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 38ea396a23d6..8e06cd3f759c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, } else { /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, &write_ok); + writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 04028f905e50..a6be53efc436 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -824,7 +824,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, upgrade_p); + writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3c81d0b8f879..59e37aa42d42 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2740,6 +2740,13 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. 
+ */ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return; + __direct_pte_prefetch(vcpu, sp, sptep); } @@ -3687,8 +3694,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable) + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, + bool write, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3709,7 +3716,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, + write, writable, hva); if (!async) return false; /* *pfn has correct page already */ @@ -3723,7 +3731,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, + write, writable, hva); return false; } @@ -3736,6 +3745,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; kvm_pfn_t pfn; + hva_t hva; int r; if (page_fault_handle_page_track(vcpu, error_code, gfn)) @@ -3754,7 +3764,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) @@ -3767,7 +3778,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 51dce186513b..389f38677837 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -600,6 +600,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.level > PG_LEVEL_4K) return; + /* + * If addresses are being invalidated, skip prefetching to avoid + * accidentally prefetching those addresses. 
+ */ + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return; + if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); @@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; + hva_t hva; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; int max_level; @@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) @@ -869,7 +877,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2fdf2c65e8fa..cf1f65c8c0bf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -500,6 +501,8 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; + unsigned long mmu_notifier_range_start; + unsigned long mmu_notifier_range_end; #endif long tlbs_dirty; struct list_head devices; @@ -752,7 +755,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, - bool *writable); + bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); @@ -1253,6 +1256,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) return 1; return 0; } + +static inline int mmu_notifier_retry_hva(struct kvm *kvm, + unsigned long mmu_seq, + unsigned long hva) +{ + lockdep_assert_held(&kvm->mmu_lock); + /* + * If mmu_notifier_count is non-zero, then the range maintained by + * kvm_mmu_notifier_invalidate_range_start contains all addresses that + * might be being invalidated. Note that it may include some false + * positives, due to shortcuts when handing concurrent invalidations. + */ + if (unlikely(kvm->mmu_notifier_count) && + hva >= kvm->mmu_notifier_range_start && + hva < kvm->mmu_notifier_range_end) + return 1; + if (kvm->mmu_notifier_seq != mmu_seq) + return 1; + return 0; +} #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3ec37dd51a62..3c34d4b0a961 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -544,6 +544,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * count is also read inside the mmu_lock critical section. */ kvm->mmu_notifier_count++; + if (likely(kvm->mmu_notifier_count == 1)) { + kvm->mmu_notifier_range_start = range->start; + kvm->mmu_notifier_range_end = range->end; + } else { + /* + * Fully tracking multiple concurrent ranges has dimishing + * returns. 
Keep things simple and just find the minimal range + * which includes the current and new ranges. As there won't be + * enough information to subtract a range after its invalidate + * completes, any ranges invalidated concurrently will + * accumulate and persist until all outstanding invalidates + * complete. + */ + kvm->mmu_notifier_range_start = + min(kvm->mmu_notifier_range_start, range->start); + kvm->mmu_notifier_range_end = + max(kvm->mmu_notifier_range_end, range->end); + } need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, range->flags); /* we've to flush the tlb before the pages can be freed */ @@ -2114,10 +2132,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, - bool *writable) + bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + if (hva) + *hva = addr; + if (addr == KVM_HVA_ERR_RO_BAD) { if (writable) *writable = false; @@ -2145,19 +2166,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, - write_fault, writable); + write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); -- Gitee From b0e0ef734691eda76e90ec774666e1a52fa5fac3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Mar 2021 16:30:29 -0800 Subject: [PATCH 036/158] KVM: x86/mmu: Skip !MMU-present SPTEs when removing SP in exclusive mode mainline inclusion from mainline-v5.12-rc3 commit 8df9f1af2eced9720f71cf310275d81c1bf07a06 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8df9f1af2eced9720f71cf310275d81c1bf07a06 ---------------------------------------------------------------------- If mmu_lock is held for write, don't bother setting !PRESENT SPTEs to REMOVED_SPTE when recursively zapping SPTEs as part of shadow page removal. The concurrent write protections provided by REMOVED_SPTE are not needed, there are no backing page side effects to record, and MMIO SPTEs can be left as is since they are protected by the memslot generation, not by ensuring that the MMIO SPTE is unreachable (which is racy with respect to lockless walks regardless of zapping behavior). Skipping !PRESENT drastically reduces the number of updates needed to tear down sparsely populated MMUs, e.g. when tearing down a 6gb VM that didn't touch much memory, 6929/7168 (~96.6%) of SPTEs were '0' and could be skipped. Avoiding the write itself is likely close to a wash, but avoiding __handle_changed_spte() is a clear-cut win as that involves saving and restoring all non-volatile GPRs (it's a subtly big function), as well as several conditional branches before bailing out. 
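The effect on the teardown loop is easy to see in a minimal user-space model (names invented for the sketch): entries that are not shadow-present are skipped outright instead of being rewritten and run through the change handler.

#include <stdint.h>

#define ENTRIES_PER_PT 512

static int is_present_model(uint64_t spte)
{
        return spte != 0;       /* stand-in for is_shadow_present_pte() */
}

/* Called with exclusive (write) ownership of the lock in this model, so no
 * other thread can make a skipped entry present behind our back. */
static unsigned int zap_children_model(uint64_t pt[ENTRIES_PER_PT])
{
        unsigned int zapped = 0;
        int i;

        for (i = 0; i < ENTRIES_PER_PT; i++) {
                if (!is_present_model(pt[i]))
                        continue;       /* nothing to record, skip the write */
                pt[i] = 0;              /* zap; side effects handled here */
                zapped++;
        }
        return zapped;
}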
Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210310003029.1250571-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 35a0113d8b70..65cbe0fd12ad 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, cpu_relax(); } } else { + /* + * If the SPTE is not MMU-present, there is no backing + * page associated with the SPTE and so no side effects + * that need to be recorded, and exclusive ownership of + * mmu_lock ensures the SPTE can't be made present. + * Note, zapping MMIO SPTEs is also unnecessary as they + * are guarded by the memslots generation, not by being + * unreachable. + */ old_child_spte = READ_ONCE(*sptep); + if (!is_shadow_present_pte(old_child_spte)) + continue; /* * Marking the SPTE as a removed SPTE is not -- Gitee From db3f2f7b7e1b343db388aa163d9d646c24bf2005 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 18:23:32 +0800 Subject: [PATCH 037/158] KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page mainline inclusion from mainline-v5.12-rc4 commit 70fb3e41a97a5fecc0aedc9a429479d702c3ab66 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=70fb3e41a97a5fecc0aedc9a429479d702c3ab66 ---------------------------------------------------------------------- The pt passed into handle_removed_tdp_mmu_page does not need RCU protection, as it is not at any risk of being freed by another thread at that point. However, the implicit cast from tdp_sptep_t to u64 * dropped the __rcu annotation without a proper rcu_derefrence. Fix this by passing the pt as a tdp_ptep_t and then rcu_dereferencing it in the function. Suggested-by: Sean Christopherson Reported-by: kernel test robot Signed-off-by: Ben Gardon Message-Id: <20210315233803.2706477-2-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 65cbe0fd12ad..e1b7e651af4d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -301,11 +301,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, * * Given a page table that has been removed from the TDP paging structure, * iterates through the page table to clear SPTEs and free child page tables. + * + * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU + * protection. Since this thread removed it from the paging structure, + * this thread will be responsible for ensuring the page is freed. Hence the + * early rcu_dereferences in the function. 
*/ -static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, +static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, bool shared) { - struct kvm_mmu_page *sp = sptep_to_sp(pt); + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; u64 old_child_spte; @@ -318,7 +323,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = pt + i; + sptep = rcu_dereference(pt) + i; gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); if (shared) { -- Gitee From 6bc2f95210c38092ab06436b96088a6f5effd871 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 18:25:28 +0800 Subject: [PATCH 038/158] KVM: x86/mmu: Fix RCU usage when atomically zapping SPTEs mainline inclusion from mainline-v5.12-rc4 commit 14f6fec2e8e04b83c87c339b8d8ff4cc62b23d35 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=14f6fec2e8e04b83c87c339b8d8ff4cc62b23d35 ---------------------------------------------------------------------- Fix a missing rcu_dereference in tdp_mmu_zap_spte_atomic. Reported-by: kernel test robot Signed-off-by: Ben Gardon Message-Id: <20210315233803.2706477-3-bgardon@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e1b7e651af4d..b763cf025833 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -543,7 +543,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * here since the SPTE is going from non-present * to non-present. */ - WRITE_ONCE(*iter->sptep, 0); + WRITE_ONCE(*rcu_dereference(iter->sptep), 0); return true; } -- Gitee From 3205877aaa710f07590afe21c5917d454ad4e7e3 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 14 Aug 2023 18:27:07 +0800 Subject: [PATCH 039/158] KVM: x86/mmu: Factor out tdp_iter_return_to_root mainline inclusion from mainline-v5.12-rc4 commit b601c3bc9d5053065acdaa1481c21481d0dc3f10 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b601c3bc9d5053065acdaa1481c21481d0dc3f10 ---------------------------------------------------------------------- In tdp_mmu_iter_cond_resched there is a call to tdp_iter_start which causes the iterator to continue its walk over the paging structure from the root. This is needed after a yield as paging structure could have been freed in the interim. The tdp_iter_start call is not very clear and something of a hack. It requires exposing tdp_iter fields not used elsewhere in tdp_mmu.c and the effect is not obvious from the function name. Factor a more aptly named function out of tdp_iter_start and call it from tdp_mmu_iter_cond_resched and tdp_iter_start. No functional change intended. 
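Shape of the refactor in miniature (a sketch with invented names, not the kernel code): the "go back to the root and resume" steps become their own helper, which both the start path and the yield path call, so the yield path no longer has to poke at iterator internals.

struct iter_model {
        int           root_level, level;
        unsigned long next_gfn, yielded_gfn, gfn;
        int           valid;
};

/* The factored-out helper: reposition the walk at the root. */
static void iter_restart_model(struct iter_model *it)
{
        it->yielded_gfn = it->next_gfn;
        it->level = it->root_level;
        it->gfn = it->next_gfn;         /* rounding to the level omitted */
        it->valid = 1;
}

/* Starting a walk sets the invariants, then reuses the same helper that the
 * cond_resched path calls after dropping and reacquiring the lock. */
static void iter_start_model(struct iter_model *it, int root_level,
                             unsigned long next_gfn)
{
        it->next_gfn = next_gfn;
        it->root_level = root_level;
        iter_restart_model(it);
}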
Signed-off-by: Ben Gardon Message-Id: <20210315233803.2706477-4-bgardon@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_iter.c | 24 +++++++++++++++++------- arch/x86/kvm/mmu/tdp_iter.h | 1 + arch/x86/kvm/mmu/tdp_mmu.c | 4 +--- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index e5f148106e20..f7f94ea65243 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -20,6 +20,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) return gfn & -KVM_PAGES_PER_HPAGE(level); } +/* + * Return the TDP iterator to the root PT and allow it to continue its + * traversal over the paging structure from there. + */ +void tdp_iter_restart(struct tdp_iter *iter) +{ + iter->yielded_gfn = iter->next_last_level_gfn; + iter->level = iter->root_level; + + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + /* * Sets a TDP iterator to walk a pre-order traversal of the paging structure * rooted at root_pt, starting with the walk to translate next_last_level_gfn. @@ -31,16 +46,11 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); iter->next_last_level_gfn = next_last_level_gfn; - iter->yielded_gfn = iter->next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; - iter->level = root_level; - iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; + iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); - tdp_iter_refresh_sptep(iter); - - iter->valid = true; + tdp_iter_restart(iter); } /* diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 4cc177d75c4a..8eb424d17c91 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -63,5 +63,6 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); +void tdp_iter_restart(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index b763cf025833..a43f21b36e11 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -664,9 +664,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], - iter->root_level, iter->min_level, - iter->next_last_level_gfn); + tdp_iter_restart(iter); return true; } -- Gitee From 9be4591b4b40fce0b012a8c35dec98e81e7bee29 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Aug 2023 18:28:38 +0800 Subject: [PATCH 040/158] KVM: x86/mmu: Store the address space ID in the TDP iterator mainline inclusion from mainline-v5.12-rc4 commit 08889894cc82bc3b213bdb192f274358e5a6b78d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=08889894cc82bc3b213bdb192f274358e5a6b78d ---------------------------------------------------------------------- Store the address space ID in the TDP iterator so that it can be retrieved without having to bounce through the root shadow page. 
This streamlines the code and fixes a Sparse warning about not properly using rcu_dereference() when grabbing the ID from the root on the fly. Reported-by: kernel test robot Signed-off-by: Sean Christopherson Signed-off-by: Ben Gardon Message-Id: <20210315233803.2706477-5-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu_internal.h | 5 +++++ arch/x86/kvm/mmu/tdp_iter.c | 6 +----- arch/x86/kvm/mmu/tdp_iter.h | 3 ++- arch/x86/kvm/mmu/tdp_mmu.c | 23 +++++------------------ 4 files changed, 13 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index cf67fa6fb8fe..1d0ff532e9e0 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) { /* diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index f7f94ea65243..b3ed302c1a35 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -49,6 +49,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, iter->root_level = root_level; iter->min_level = min_level; iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt; + iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt)); tdp_iter_restart(iter); } @@ -169,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) -{ - return iter->pt_path[iter->root_level - 1]; -} - diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 8eb424d17c91..b1748b988d3a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -36,6 +36,8 @@ struct tdp_iter { int min_level; /* The iterator's current level within the paging structure */ int level; + /* The address space ID, i.e. SMM vs. regular. */ + int as_id; /* A snapshot of the value at sptep */ u64 old_spte; /* @@ -62,7 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a43f21b36e11..146a7f993125 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -203,11 +203,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) -{ - return sp->role.smm ? 
1 : 0; -} - static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); @@ -497,10 +492,6 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - u64 *root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -514,8 +505,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, new_spte) != iter->old_spte) return false; - handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, true); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return true; } @@ -569,10 +560,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - tdp_ptep_t root_pt = tdp_iter_root_pt(iter); - struct kvm_mmu_page *root = sptep_to_sp(root_pt); - int as_id = kvm_mmu_page_as_id(root); - lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -586,13 +573,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); - __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level, false); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level); } -- Gitee From 6446cae250bbf02db1699b551dbce7361485e51d Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 8 Feb 2021 19:51:32 +0100 Subject: [PATCH 041/158] KVM: x86/mmu: Make HVA handler retpoline-friendly mainline inclusion from mainline-v5.12-rc1 commit 8f5c44f953d36f8c1aea6d57eb3251e3640f4dad category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8f5c44f953d36f8c1aea6d57eb3251e3640f4dad ---------------------------------------------------------------------- When retpolines are enabled they have high overhead in the inner loop inside kvm_handle_hva_range() that iterates over the provided memory area. Let's mark this function and its TDP MMU equivalent __always_inline so compiler will be able to change the call to the actual handler function inside each of them into a direct one. This significantly improves performance on the unmap test on the existing kernel memslot code (tested on a Xeon 8167M machine): 30 slots in use: Test Before After Improvement Unmap 0.0353s 0.0334s 5% Unmap 2M 0.00104s 0.000407s 61% 509 slots in use: Test Before After Improvement Unmap 0.0742s 0.0740s None Unmap 2M 0.00221s 0.00159s 28% Looks like having an indirect call in these functions (and, so, a retpoline) might have interfered with unrolling of the whole loop in the CPU. Signed-off-by: Maciej S. 
Szmigiero Message-Id: <732d3fe9eb68aa08402a638ab0309199fa89ae56.1612810129.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 21 +++++++++++---------- arch/x86/kvm/mmu/tdp_mmu.c | 16 +++++++++++----- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 59e37aa42d42..e5ffe3ba98a6 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1445,16 +1445,17 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, - int level, - unsigned long data)) +static __always_inline int +kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, + gfn_t gfn, + int level, + unsigned long data)) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 146a7f993125..bbf3d6560590 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -882,11 +882,17 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return ret; } -static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end, unsigned long data, - int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, - gfn_t end, unsigned long data)) +static __always_inline int +kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, + gfn_t start, + gfn_t end, + unsigned long data)) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; -- Gitee From 54e3f430ba7720cd3c25388708ac6e6ec134102e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:10:47 -0800 Subject: [PATCH 042/158] KVM: x86/mmu: Capture 'mmu' in a local variable when allocating roots mainline inclusion from mainline-v5.13-rc1 commit b37233c911cbecd22a8a2a80137efe706c727d76 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b37233c911cbecd22a8a2a80137efe706c727d76 ---------------------------------------------------------------------- Grab 'mmu' and do s/vcpu->arch.mmu/mmu to shorten line lengths and yield smaller diffs when moving code around in future cleanup without forcing the new code to use the same ugly pattern. No functional change intended. 
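The pattern being applied is simply hoisting the repeated pointer chase into a local, e.g. (a trivial stand-alone sketch with made-up types):

struct mmu_model  { int shadow_root_level; };
struct vcpu_model { struct { struct mmu_model *mmu; } arch; };

static int root_level_model(struct vcpu_model *vcpu)
{
        struct mmu_model *mmu = vcpu->arch.mmu;        /* capture once */

        return mmu->shadow_root_level;                 /* use 'mmu' after */
}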
Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-4-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 74 ++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e5ffe3ba98a6..f7961c729d62 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3243,7 +3243,8 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { - u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u8 shadow_root_level = mmu->shadow_root_level; hpa_t root; unsigned i; @@ -3252,42 +3253,43 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(root)) return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + mmu->root_hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); if (!VALID_PAGE(root)) return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + mmu->root_hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); + MMU_WARN_ON(VALID_PAGE(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, true); if (!VALID_PAGE(root)) return -ENOSPC; - vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; + mmu->pae_root[i] = root | PT_PRESENT_MASK; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); + mmu->root_hpa = __pa(mmu->pae_root); } else BUG(); /* root_pgd is ignored for direct MMUs. */ - vcpu->arch.mmu->root_pgd = 0; + mmu->root_pgd = 0; return 0; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptr, pm_mask; gfn_t root_gfn, root_pgd; hpa_t root; int i; - root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); + root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) @@ -3297,14 +3299,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa)); + if (mmu->root_level >= PT64_ROOT_4LEVEL) { + MMU_WARN_ON(VALID_PAGE(mmu->root_hpa)); root = mmu_alloc_root(vcpu, root_gfn, 0, - vcpu->arch.mmu->shadow_root_level, false); + mmu->shadow_root_level, false); if (!VALID_PAGE(root)) return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + mmu->root_hpa = root; goto set_root_pgd; } @@ -3314,7 +3316,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; /* @@ -3322,21 +3324,21 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * with 64-bit only when needed. Unlike 32-bit NPT, it doesn't * need to be in low mem. See also pml4_root below. 
*/ - if (!vcpu->arch.mmu->pae_root) { + if (!mmu->pae_root) { WARN_ON_ONCE(!tdp_enabled); - vcpu->arch.mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mmu->pae_root) + mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!mmu->pae_root) return -ENOMEM; } } for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); - if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { - pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); + MMU_WARN_ON(VALID_PAGE(mmu->pae_root[i])); + if (mmu->root_level == PT32E_ROOT_LEVEL) { + pdptr = mmu->get_pdptr(vcpu, i); if (!(pdptr & PT_PRESENT_MASK)) { - vcpu->arch.mmu->pae_root[i] = 0; + mmu->pae_root[i] = 0; continue; } root_gfn = pdptr >> PAGE_SHIFT; @@ -3348,9 +3350,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) PT32_ROOT_LEVEL, false); if (!VALID_PAGE(root)) return -ENOSPC; - vcpu->arch.mmu->pae_root[i] = root | pm_mask; + mmu->pae_root[i] = root | pm_mask; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); + mmu->root_hpa = __pa(mmu->pae_root); /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP @@ -3359,51 +3361,51 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * on demand, as running a 32-bit L1 VMM is very rare. The PDP is * handled above (to share logic with PAE), deal with the PML4 here. */ - if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { - if (vcpu->arch.mmu->pml4_root == NULL) { + if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (mmu->pml4_root == NULL) { u64 *pml4_root; pml4_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml4_root) return -ENOMEM; - pml4_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; + pml4_root[0] = __pa(mmu->pae_root) | pm_mask; - vcpu->arch.mmu->pml4_root = pml4_root; + mmu->pml4_root = pml4_root; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pml4_root); + mmu->root_hpa = __pa(mmu->pml4_root); } #ifdef CONFIG_X86_64 - if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_5LEVEL) { - if (vcpu->arch.mmu->pml4_root == NULL) { + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { + if (mmu->pml4_root == NULL) { u64 *pml4_root; pml4_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml4_root) return -ENOMEM; - pml4_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; + pml4_root[0] = __pa(mmu->pae_root) | pm_mask; - vcpu->arch.mmu->pml4_root = pml4_root; + mmu->pml4_root = pml4_root; } - if (vcpu->arch.mmu->pml5_root == NULL) { + if (mmu->pml5_root == NULL) { u64 *pml5_root; pml5_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml5_root) return -ENOMEM; - pml5_root[0] = __pa(vcpu->arch.mmu->pml4_root) | pm_mask; + pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; - vcpu->arch.mmu->pml5_root = pml5_root; + mmu->pml5_root = pml5_root; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pml5_root); + mmu->root_hpa = __pa(vcpu->arch.mmu->pml5_root); } #endif set_root_pgd: - vcpu->arch.mmu->root_pgd = root_pgd; + mmu->root_pgd = root_pgd; return 0; } -- Gitee From 2a203305b8d0eb4fad9cc6cca7144b88f34da33d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:10:48 -0800 Subject: [PATCH 043/158] KVM: x86/mmu: Allocate the lm_root before allocating PAE roots mainline inclusion from mainline-v5.13-rc1 commit ba0a194ffbfb4168a277fb2116e8362013e2078f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ba0a194ffbfb4168a277fb2116e8362013e2078f ---------------------------------------------------------------------- Allocate lm_root before the PAE roots so that the PAE roots aren't leaked if the memory allocation for the lm_root happens to fail. Note, KVM can still leak PAE roots if mmu_check_root() fails on a guest's PDPTR, or if mmu_alloc_root() fails due to MMU pages not being available. Those issues will be fixed in future commits. Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-5-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 96 ++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 55 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f7961c729d62..b3336c75332b 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3316,20 +3316,36 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK; - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - /* - * Allocate the page for the PDPTEs when shadowing 32-bit NPT - * with 64-bit only when needed. Unlike 32-bit NPT, it doesn't - * need to be in low mem. See also pml4_root below. - */ - if (!mmu->pae_root) { - WARN_ON_ONCE(!tdp_enabled); + /* + * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP + * tables are allocated and initialized at root creation as there is no + * equivalent level in the guest's NPT to shadow. Allocate the tables + * on demand, as running a 32-bit L1 VMM is very rare. Unlike 32-bit + * NPT, the PDP table doesn't need to be in low mem. Preallocate the + * pages so that the PAE roots aren't leaked on failure. + */ + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + (!mmu->pae_root)) { + u64 *pml4_root, *pae_root; + + pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pae_root) + return -ENOMEM; - mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!mmu->pae_root) + if (!mmu->pml4_root) { + pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml4_root) { + free_page((unsigned long)pae_root); return -ENOMEM; + } + + mmu->pae_root = pae_root; + mmu->pml4_root = pml4_root; + + pml4_root[0] = __pa(mmu->pae_root) | pm_mask; } } @@ -3352,58 +3368,28 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return -ENOSPC; mmu->pae_root[i] = root | pm_mask; } - mmu->root_hpa = __pa(mmu->pae_root); - - /* - * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP - * tables are allocated and initialized at MMU creation as there is no - * equivalent level in the guest's NPT to shadow. Allocate the tables - * on demand, as running a 32-bit L1 VMM is very rare. The PDP is - * handled above (to share logic with PAE), deal with the PML4 here. 
- */ - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { - if (mmu->pml4_root == NULL) { - u64 *pml4_root; - pml4_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) - return -ENOMEM; - - pml4_root[0] = __pa(mmu->pae_root) | pm_mask; - - mmu->pml4_root = pml4_root; - } - - mmu->root_hpa = __pa(mmu->pml4_root); - } #ifdef CONFIG_X86_64 - if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { - if (mmu->pml4_root == NULL) { - u64 *pml4_root; - - pml4_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) - return -ENOMEM; - - pml4_root[0] = __pa(mmu->pae_root) | pm_mask; + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL && + mmu->pml5_root == NULL) { + u64 *pml5_root; - mmu->pml4_root = pml4_root; - } - if (mmu->pml5_root == NULL) { - u64 *pml5_root; - - pml5_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml5_root) - return -ENOMEM; - - pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; + pml5_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml5_root) + return -ENOMEM; - mmu->pml5_root = pml5_root; - } - mmu->root_hpa = __pa(vcpu->arch.mmu->pml5_root); + pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; + mmu->pml5_root = pml5_root; } #endif + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->root_hpa = __pa(mmu->pml5_root); + else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + mmu->root_hpa = __pa(mmu->pml4_root); + else + mmu->root_hpa = __pa(mmu->pae_root); + set_root_pgd: mmu->root_pgd = root_pgd; -- Gitee From cd414417fdf673c6a5fa3fc7fc9c911dfd67b5fd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:10:49 -0800 Subject: [PATCH 044/158] KVM: x86/mmu: Allocate pae_root and lm_root pages in dedicated helper mainline inclusion from mainline-v5.13-rc1 commit 748e52b9b7368017d3fccb486914804ed4577b42 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=748e52b9b7368017d3fccb486914804ed4577b42 ---------------------------------------------------------------------- Move the on-demand allocation of the pae_root and lm_root pages, used by nested NPT for 32-bit L1s, into a separate helper. This will allow a future patch to hold mmu_lock while allocating the non-special roots so that make_mmu_pages_available() can be checked once at the start of root allocation, and thus avoid having to deal with failure in the middle of root allocation. Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-6-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 109 +++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b3336c75332b..b212c927feb0 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3316,37 +3316,13 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK; - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - /* - * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP - * tables are allocated and initialized at root creation as there is no - * equivalent level in the guest's NPT to shadow. Allocate the tables - * on demand, as running a 32-bit L1 VMM is very rare. 
Unlike 32-bit - * NPT, the PDP table doesn't need to be in low mem. Preallocate the - * pages so that the PAE roots aren't leaked on failure. - */ - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - (!mmu->pae_root)) { - u64 *pml4_root, *pae_root; - - pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pae_root) - return -ENOMEM; - - if (!mmu->pml4_root) { - pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) { - free_page((unsigned long)pae_root); - return -ENOMEM; - } + mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; - mmu->pae_root = pae_root; - mmu->pml4_root = pml4_root; - - pml4_root[0] = __pa(mmu->pae_root) | pm_mask; - } + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; } for (i = 0; i < 4; ++i) { @@ -3369,20 +3345,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) mmu->pae_root[i] = root | pm_mask; } -#ifdef CONFIG_X86_64 - if (mmu->shadow_root_level == PT64_ROOT_5LEVEL && - mmu->pml5_root == NULL) { - u64 *pml5_root; - - pml5_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml5_root) - return -ENOMEM; - - pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; - mmu->pml5_root = pml5_root; - } -#endif - if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) mmu->root_hpa = __pa(mmu->pml5_root); else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) @@ -3396,6 +3358,66 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return 0; } +static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 *pml5_root = NULL; + u64 *pml4_root = NULL; + u64 *pae_root; + + + /* + * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP + * tables are allocated and initialized at root creation as there is no + * equivalent level in the guest's NPT to shadow. Allocate the tables + * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. + */ + if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL || + mmu->shadow_root_level < PT64_ROOT_4LEVEL) + return 0; + + if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root) + return 0; + + /* + * The special roots should always be allocated in concert. Yell and + * bail if KVM ends up in a state where only one of the roots is valid. + */ + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || + mmu->pml5_root)) + return -EIO; + + /* Unlike 32-bit NPT, the PDP table doesn't need to be in low mem. 
*/ + pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pae_root) + return -ENOMEM; + +#ifdef CONFIG_X86_64 + if (!pml4_root) + goto err_pml4; + + if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) { + pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml5_root) + goto err_pml5; + } +#endif + + mmu->pae_root = pae_root; + mmu->pml4_root = pml4_root; + mmu->pml5_root = pml5_root; + + return 0; + +#ifdef CONFIG_X86_64 +err_pml5: + free_page((unsigned long)pml4_root); +err_pml4: + free_page((unsigned long)pae_root); + return -ENOMEM; +#endif +} + static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { if (vcpu->arch.mmu->direct_map) @@ -4886,6 +4908,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) int r; r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); + if (r) + goto out; + r = mmu_alloc_special_roots(vcpu); if (r) goto out; r = mmu_alloc_roots(vcpu); -- Gitee From 05e2f6853fb9db8cb354e780ca4dfa3f926fa6fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:10:50 -0800 Subject: [PATCH 045/158] KVM: x86/mmu: Ensure MMU pages are available when allocating roots mainline inclusion from mainline-v5.13-rc1 commit 6e6ec58485746eb64487bd49bf5cd90ded3d2cf6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6e6ec58485746eb64487bd49bf5cd90ded3d2cf6 ---------------------------------------------------------------------- Hold the mmu_lock for write for the entire duration of allocating and initializing an MMU's roots. This ensures there are MMU pages available and thus prevents root allocations from failing. That in turn fixes a bug where KVM would fail to free valid PAE roots if a one of the later roots failed to allocate. Add a comment to make_mmu_pages_available() to call out that the limit is a soft limit, e.g. KVM will temporarily exceed the threshold if a page fault allocates multiple shadow pages and there was only one page "available". Note, KVM _still_ leaks the PAE roots if the guest PDPTR checks fail. This will be addressed in a future commit. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-7-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 50 +++++++++++++++----------------------- arch/x86/kvm/mmu/tdp_mmu.c | 23 ++++-------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b212c927feb0..414aa27b5b70 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2442,6 +2442,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); + /* + * Note, this check is intentionally soft, it only guarantees that one + * page is available, while the caller may end up allocating as many as + * four pages, e.g. for PAE roots or for 5-level paging. Temporarily + * exceeding the (arbitrary by default) limit will not harm the host, + * being too agressive may unnecessarily kill the guest, and getting an + * exact count is far more trouble than it's worth, especially in the + * page fault paths. 
+ */ if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; return 0; @@ -3228,16 +3237,9 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, { struct kvm_mmu_page *sp; - write_lock(&vcpu->kvm->mmu_lock); - - if (make_mmu_pages_available(vcpu)) { - write_unlock(&vcpu->kvm->mmu_lock); - return INVALID_PAGE; - } sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); ++sp->root_count; - write_unlock(&vcpu->kvm->mmu_lock); return __pa(sp->spt); } @@ -3250,16 +3252,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (is_tdp_mmu_enabled(vcpu->kvm)) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); - - if (!VALID_PAGE(root)) - return -ENOSPC; mmu->root_hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, - true); - - if (!VALID_PAGE(root)) - return -ENOSPC; + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); mmu->root_hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { @@ -3267,8 +3262,6 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, true); - if (!VALID_PAGE(root)) - return -ENOSPC; mmu->pae_root[i] = root | PT_PRESENT_MASK; } mmu->root_hpa = __pa(mmu->pae_root); @@ -3304,8 +3297,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root = mmu_alloc_root(vcpu, root_gfn, 0, mmu->shadow_root_level, false); - if (!VALID_PAGE(root)) - return -ENOSPC; mmu->root_hpa = root; goto set_root_pgd; } @@ -3327,6 +3318,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { MMU_WARN_ON(VALID_PAGE(mmu->pae_root[i])); + if (mmu->root_level == PT32E_ROOT_LEVEL) { pdptr = mmu->get_pdptr(vcpu, i); if (!(pdptr & PT_PRESENT_MASK)) { @@ -3340,8 +3332,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root = mmu_alloc_root(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, false); - if (!VALID_PAGE(root)) - return -ENOSPC; mmu->pae_root[i] = root | pm_mask; } @@ -3418,14 +3408,6 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) #endif } -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mmu->direct_map) - return mmu_alloc_direct_roots(vcpu); - else - return mmu_alloc_shadow_roots(vcpu); -} - void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; @@ -4913,7 +4895,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_alloc_special_roots(vcpu); if (r) goto out; - r = mmu_alloc_roots(vcpu); + write_lock(&vcpu->kvm->mmu_lock); + if (make_mmu_pages_available(vcpu)) + r = -ENOSPC; + else if (vcpu->arch.mmu->direct_map) + r = mmu_alloc_direct_roots(vcpu); + else + r = mmu_alloc_shadow_roots(vcpu); + write_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_sync_roots(vcpu); if (r) goto out; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index bbf3d6560590..1e7f54734e26 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -137,22 +137,21 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, return sp; } -static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { union kvm_mmu_page_role role; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; - role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + lockdep_assert_held_write(&kvm->mmu_lock); - write_lock(&kvm->mmu_lock); + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); /* Check for an 
existing root before allocating a new one. */ for_each_tdp_mmu_root(kvm, root) { if (root->role.word == role.word) { kvm_mmu_get_root(kvm, root); - write_unlock(&kvm->mmu_lock); - return root; + goto out; } } @@ -161,19 +160,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) list_add(&root->link, &kvm->arch.tdp_mmu_roots); - write_unlock(&kvm->mmu_lock); - - return root; -} - -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *root; - - root = get_tdp_mmu_vcpu_root(vcpu); - if (!root) - return INVALID_PAGE; - +out: return __pa(root->spt); } -- Gitee From 737de5b3127cd3f0786f84b319a7bbe94e45fdf9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:10:51 -0800 Subject: [PATCH 046/158] KVM: x86/mmu: Check PDPTRs before allocating PAE roots mainline inclusion from mainline-v5.13-rc1 commit 6e0918aec49a5f89ca22c60c60cb5d20d8c9af29 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6e0918aec49a5f89ca22c60c60cb5d20d8c9af29 ---------------------------------------------------------------------- Check the validity of the PDPTRs before allocating any of the PAE roots, otherwise a bad PDPTR will cause KVM to leak any previously allocated roots. Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-8-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 414aa27b5b70..9870ca858e0e 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3277,7 +3277,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 pdptr, pm_mask; + u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; hpa_t root; int i; @@ -3288,6 +3288,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu_check_root(vcpu, root_gfn)) return 1; + if (mmu->root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + pdptrs[i] = mmu->get_pdptr(vcpu, i); + if (!(pdptrs[i] & PT_PRESENT_MASK)) + continue; + + if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) + return 1; + } + } + /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. 
@@ -3320,14 +3331,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) MMU_WARN_ON(VALID_PAGE(mmu->pae_root[i])); if (mmu->root_level == PT32E_ROOT_LEVEL) { - pdptr = mmu->get_pdptr(vcpu, i); - if (!(pdptr & PT_PRESENT_MASK)) { + if (!(pdptrs[i] & PT_PRESENT_MASK)) { mmu->pae_root[i] = 0; continue; } - root_gfn = pdptr >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; + root_gfn = pdptrs[i] >> PAGE_SHIFT; } root = mmu_alloc_root(vcpu, root_gfn, i << 30, -- Gitee From 7589ea8eff8f86a901af62d8d9de5ed4ba017a89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:10:52 -0800 Subject: [PATCH 047/158] KVM: x86/mmu: Fix and unconditionally enable WARNs to detect PAE leaks mainline inclusion from mainline-v5.13-rc1 commit e49e0b7bf370ebed369f7f0466f349aac5ff12f1 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e49e0b7bf370ebed369f7f0466f349aac5ff12f1 ---------------------------------------------------------------------- Exempt NULL PAE roots from the check to detect leaks, since kvm_mmu_free_roots() doesn't set them back to INVALID_PAGE. Stop hiding the WARNs to detect PAE root leaks behind MMU_WARN_ON, the hidden WARNs obviously didn't do their job given the hilarious number of bugs that could lead to PAE roots being leaked, not to mention the above false positive. Opportunistically delete a warning on root_hpa being valid, there's nothing special about 4/5-level shadow pages that warrants a WARN. Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-9-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9870ca858e0e..53f29efc5d80 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3258,7 +3258,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) mmu->root_hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(mmu->pae_root[i])); + WARN_ON_ONCE(mmu->pae_root[i] && + VALID_PAGE(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, true); @@ -3304,8 +3305,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * write-protect the guests page table root. 
*/ if (mmu->root_level >= PT64_ROOT_4LEVEL) { - MMU_WARN_ON(VALID_PAGE(mmu->root_hpa)); - root = mmu_alloc_root(vcpu, root_gfn, 0, mmu->shadow_root_level, false); mmu->root_hpa = root; @@ -3328,7 +3327,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(mmu->pae_root[i])); + WARN_ON_ONCE(mmu->pae_root[i] && VALID_PAGE(mmu->pae_root[i])); if (mmu->root_level == PT32E_ROOT_LEVEL) { if (!(pdptrs[i] & PT_PRESENT_MASK)) { -- Gitee From e995736f7118c89a0804271c8484831f9c42b94f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:10:54 -0800 Subject: [PATCH 048/158] KVM: x86/mmu: Set the C-bit in the PDPTRs and LM pseudo-PDPTRs mainline inclusion from mainline-v5.13-rc1 commit 17e368d94af77c1533bfd4136e080a33a6330282 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=17e368d94af77c1533bfd4136e080a33a6330282 ---------------------------------------------------------------------- Set the C-bit in SPTEs that are set outside of the normal MMU flows, specifically the PDPDTRs and the handful of special cased "LM root" entries, all of which are shadow paging only. Note, the direct-mapped-root PDPTR handling is needed for the scenario where paging is disabled in the guest, in which case KVM uses a direct mapped MMU even though TDP is disabled. Fixes: d0ec49d4de90 ("kvm/x86/svm: Support Secure Memory Encryption within KVM") Cc: stable@vger.kernel.org Cc: Brijesh Singh Cc: Tom Lendacky Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-11-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 53f29efc5d80..fbc2fadab8db 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3263,7 +3263,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, true); - mmu->pae_root[i] = root | PT_PRESENT_MASK; + mmu->pae_root[i] = root | PT_PRESENT_MASK | + shadow_me_mask; } mmu->root_hpa = __pa(mmu->pae_root); } else @@ -3316,7 +3317,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table. */ - pm_mask = PT_PRESENT_MASK; + pm_mask = PT_PRESENT_MASK | shadow_me_mask; if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; -- Gitee From 45dde1fd94d9fa9c15ea3096b827dc4a0aa5e6b8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:11:00 -0800 Subject: [PATCH 049/158] KVM: x86/mmu: Sync roots after MMU load iff load as successful mainline inclusion from mainline-v5.13-rc1 commit a91f387b4bfedab595a987ad41e75d8839a958bf category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a91f387b4bfedab595a987ad41e75d8839a958bf ---------------------------------------------------------------------- For clarity, explicitly skip syncing roots if the MMU load failed instead of relying on the !VALID_PAGE check in kvm_mmu_sync_roots(). 
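For reference, the tail of kvm_mmu_load() then reads roughly as follows (a condensed sketch based on the hunk below; only the reordering around kvm_mmu_sync_roots() is the point of this patch):

    write_unlock(&vcpu->kvm->mmu_lock);
    if (r)
        goto out;            /* root allocation failed: skip the sync entirely */

    kvm_mmu_sync_roots(vcpu);

    kvm_mmu_load_pgd(vcpu);
    kvm_x86_ops.tlb_flush_current(vcpu);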
Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-17-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fbc2fadab8db..09272954d173 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4911,10 +4911,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) else r = mmu_alloc_shadow_roots(vcpu); write_unlock(&vcpu->kvm->mmu_lock); - - kvm_mmu_sync_roots(vcpu); if (r) goto out; + + kvm_mmu_sync_roots(vcpu); + kvm_mmu_load_pgd(vcpu); kvm_x86_ops.tlb_flush_current(vcpu); out: -- Gitee From d43fe908e4974d265b8727331dc030424fd2f559 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 17:11:01 -0800 Subject: [PATCH 050/158] KVM: x86/mmu: WARN on NULL pae_root or lm_root, or bad shadow root level mainline inclusion from mainline-v5.13-rc1 commit 73ad160693dc3baf230d76cf44c3207defad6e21 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=73ad160693dc3baf230d76cf44c3207defad6e21 ---------------------------------------------------------------------- WARN if KVM is about to dereference a NULL pae_root or lm_root when loading an MMU, and convert the BUG() on a bad shadow_root_level into a WARN (now that errors are handled cleanly). With nested NPT, botching the level and sending KVM down the wrong path is all too easy, and the on-demand allocation of pae_root and lm_root means bugs crash the host. Obviously, KVM could unconditionally allocate the roots, but that's arguably a worse failure mode as it would potentially corrupt the guest instead of crashing it. Signed-off-by: Sean Christopherson Message-Id: <20210305011101.3597423-18-seanjc@google.com> Signed-off-by: Paolo Bonzini conflicts: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 09272954d173..09663d185b4b 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3257,6 +3257,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); mmu->root_hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { + if (WARN_ON_ONCE(!mmu->pae_root)) + return -EIO; + for (i = 0; i < 4; ++i) { WARN_ON_ONCE(mmu->pae_root[i] && VALID_PAGE(mmu->pae_root[i])); @@ -3267,8 +3270,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) shadow_me_mask; } mmu->root_hpa = __pa(mmu->pae_root); - } else - BUG(); + } else { + WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); + return -EIO; + } /* root_pgd is ignored for direct MMUs. */ mmu->root_pgd = 0; @@ -3312,6 +3317,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) goto set_root_pgd; } + if (WARN_ON_ONCE(!mmu->pae_root)) + return -EIO; + /* * We shadow a 32 bit page table. This may be a legacy 2-level * or a PAE 3-level page table. 
In either case we need to be aware that @@ -3321,6 +3329,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + if (WARN_ON_ONCE(!mmu->pml4_root)) + return -EIO; + mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) -- Gitee From bc792d06914b0085d779a30cec41a2abfcb97aa7 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 25 Feb 2021 17:41:33 +0200 Subject: [PATCH 051/158] KVM: x86: mmu: initialize fault.async_page_fault in walk_addr_generic mainline inclusion from mainline-v5.13-rc1 commit 422e2e17066ca04515e159c42570a3521d83d30b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=422e2e17066ca04515e159c42570a3521d83d30b ---------------------------------------------------------------------- This field was left uninitialized by a mistake. Signed-off-by: Maxim Levitsky Message-Id: <20210225154135.405125-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/paging_tmpl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 389f38677837..51687648e7e5 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -502,6 +502,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, #endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; + walker->fault.async_page_fault = false; trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; -- Gitee From 8f520be4bfb71b1e25c25fd04f1c3f898d8ceeee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 17:03:25 -0800 Subject: [PATCH 052/158] KVM: x86/mmu: Remove spurious TLB flush from TDP MMU's change_pte() hook mainline inclusion from mainline-v5.13-rc1 commit f055ab634c838a5f9d6c352c2d6d6a9042918ee9 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f055ab634c838a5f9d6c352c2d6d6a9042918ee9 ---------------------------------------------------------------------- Remove an unnecessary remote TLB flush from set_tdp_spte(), the TDP MMU's hook for handling change_pte() invocations from the MMU notifier. If the new host PTE is writable, the flush is completely redundant as there are no further changes to the SPTE before the post-loop flush. If the host PTE is read-only, then the primary MMU is responsible for ensuring that the contents of the old and new pages are identical, thus it's safe to let the guest continue reading both the old and new pages. KVM must only ensure the old page cannot be referenced after returning from its callback; this is handled by the post-loop flush.
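A condensed sketch of set_tdp_spte() after this change; the loop macro, the need_flush bookkeeping and the final flush are paraphrased from the surrounding function, which the hunk below only partially shows:

    tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
        if (!is_shadow_present_pte(iter.old_spte))
            break;

        tdp_mmu_set_spte(kvm, &iter, 0);    /* zap; no per-SPTE remote flush */

        if (!pte_write(*ptep)) {
            new_spte = kvm_mmu_changed_pte_notifier_make_spte(
                    iter.old_spte, new_pfn);
            tdp_mmu_set_spte(kvm, &iter, new_spte);
        }

        need_flush = 1;
    }

    if (need_flush)
        kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);    /* post-loop flush */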
Fixes: 1d8dd6b3f12b ("kvm: x86/mmu: Support changed pte notifier in tdp MMU") Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210226010329.1766033-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1e7f54734e26..aa9dd9ab461c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1034,10 +1034,14 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, if (!is_shadow_present_pte(iter.old_spte)) break; + /* + * Note, when changing a read-only SPTE, it's not strictly + * necessary to zero the SPTE before setting the new PFN, but + * doing so preserves the invariant that the PFN of a present + * leaf SPTE can never change. See __handle_changed_spte(). + */ tdp_mmu_set_spte(kvm, &iter, 0); - kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); - if (!pte_write(*ptep)) { new_spte = kvm_mmu_changed_pte_notifier_make_spte( iter.old_spte, new_pfn); -- Gitee From 0dbc23cbfae9c77c803965659ce57e77a0b4db32 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 17:03:26 -0800 Subject: [PATCH 053/158] KVM: x86/mmu: WARN if TDP MMU's set_tdp_spte() sees multiple GFNs mainline inclusion from mainline-v5.13-rc1 commit 74fe0f547454a19a033b03ac55cf248e28f11db6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=74fe0f547454a19a033b03ac55cf248e28f11db6 ---------------------------------------------------------------------- WARN if set_tdp_spte() is invoked with multiple GFNs. It is specifically a callback to handle a single host PTE being changed. Consuming the @end parameter also eliminates the confusing 'unused' parameter. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210226010329.1766033-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index aa9dd9ab461c..a6a8fc194b1b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1012,7 +1012,7 @@ int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) * Returns non-zero if a flush is needed before releasing the MMU lock.
*/ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t end, unsigned long data) { struct tdp_iter iter; @@ -1023,7 +1023,7 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, rcu_read_lock(); - WARN_ON(pte_huge(*ptep)); + WARN_ON(pte_huge(*ptep) || (gfn + 1) != end); new_pfn = pte_pfn(*ptep); -- Gitee From 217a28b9e7ae9cd43e2d86507d2b4bec9a6b7c1c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 17:03:27 -0800 Subject: [PATCH 054/158] KVM: x86/mmu: Use 'end' param in TDP MMU's test_age_gfn() mainline inclusion from mainline-v5.13-rc1 commit e12b785e52fc26d5456b16d5a6e1968cd1deab41 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e12b785e52fc26d5456b16d5a6e1968cd1deab41 ---------------------------------------------------------------------- Use the @end param when aging a GFN instead of hardcoding the walk to a single GFN. Unlike tdp_set_spte(), which simply cannot work with more than one GFN, aging multiple GFNs would not break, though admittedly it would be weird. Be nice to the casual reader and don't make them puzzle out why the end GFN is unused. No functional change intended. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210226010329.1766033-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a6a8fc194b1b..947c5555f325 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -987,12 +987,12 @@ int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, } static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, - unsigned long unused2) + struct kvm_mmu_page *root, gfn_t gfn, gfn_t end, + unsigned long unused) { struct tdp_iter iter; - tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) + tdp_root_for_each_leaf_pte(iter, root, gfn, end) if (is_accessed_spte(iter.old_spte)) return 1; -- Gitee From 81bb8ef82ecddc91c6ebfeab5801b108f32eacff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 17:03:28 -0800 Subject: [PATCH 055/158] KVM: x86/mmu: Add typedefs for rmap/iter handlers mainline inclusion from mainline-v5.13-rc1 commit c1b91493ed31cd73fec7f3d385b00b4d42d59349 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c1b91493ed31cd73fec7f3d385b00b4d42d59349 ---------------------------------------------------------------------- Add typedefs for the MMU handlers that are invoked when walking the MMU SPTEs (rmaps in legacy MMU) to act on a host virtual address range. No functional change intended. 
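For reference, the new rmap typedef (taken from the diff below; the TDP MMU gains an analogous tdp_handler_t) reduces the walker's prototype to:

    typedef int (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                                  struct kvm_memory_slot *slot, gfn_t gfn,
                                  int level, unsigned long data);

    static __always_inline int kvm_handle_hva_range(struct kvm *kvm,
                                                    unsigned long start,
                                                    unsigned long end,
                                                    unsigned long data,
                                                    rmap_handler_t handler);

The existing rmap callbacks already match this signature, so call sites are unaffected.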
Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210226010329.1766033-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 27 ++++++++++----------------- arch/x86/kvm/mmu/tdp_mmu.c | 20 +++++++++----------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 09663d185b4b..98d19476215f 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1445,17 +1445,15 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -static __always_inline int -kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, - int level, - unsigned long data)) +typedef int (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, unsigned long data); + +static __always_inline int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + rmap_handler_t handler) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1494,12 +1492,7 @@ kvm_handle_hva_range(struct kvm *kvm, } static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, int level, - unsigned long data)) + unsigned long data, rmap_handler_t handler) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 947c5555f325..c087df67f872 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -869,17 +869,15 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return ret; } -static __always_inline int -kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, - gfn_t start, - gfn_t end, - unsigned long data)) +typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, gfn_t end, + unsigned long data); + +static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + tdp_handler_t handler) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; -- Gitee From dc4df8675a7453287544ce079f4d1310299d369c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 17:03:29 -0800 Subject: [PATCH 056/158] KVM: x86/mmu: Add convenience wrapper for acting on single hva in TDP MMU mainline inclusion from mainline-v5.13-rc1 commit 203219571330a591bc60b84ab052dbe0ccc52827 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=203219571330a591bc60b84ab052dbe0ccc52827 ---------------------------------------------------------------------- Add a TDP MMU helper to handle a single HVA hook, the name is a nice reminder that the flow in question is operating on a single HVA. No functional change intended. 
Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210226010329.1766033-6-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c087df67f872..a279efd36952 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -912,6 +912,14 @@ static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, return ret; } +static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm, + unsigned long addr, + unsigned long data, + tdp_handler_t handler) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler); +} + static int zap_gfn_range_hva_wrapper(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_mmu_page *root, gfn_t start, @@ -999,8 +1007,7 @@ static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, - test_age_gfn); + return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn); } /* @@ -1061,9 +1068,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, pte_t *host_ptep) { - return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, - (unsigned long)host_ptep, - set_tdp_spte); + return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep, + set_tdp_spte); } /* -- Gitee From d8de8a73aa3c905a3a3580cee25f9e962be06ab3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:27 -0800 Subject: [PATCH 057/158] KVM: x86/mmu: Check for shadow-present SPTE before querying A/D status mainline inclusion from mainline-v5.13-rc1 commit 64bb2769d700f56dbb2f95705bb0732acddd00bf category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=64bb2769d700f56dbb2f95705bb0732acddd00bf ---------------------------------------------------------------------- When updating accessed and dirty bits, check that the new SPTE is present before attempting to query its A/D bits. Failure to confirm the SPTE is present can theoretically cause a false negative, e.g. if a MMIO SPTE replaces a "real" SPTE and somehow the PFNs magically match. Realistically, this is all but guaranteed to be a benign bug. Fix it up primarily so that a future patch can tweak the MMU_WARN_ON checking A/D status to fire if the SPTE is not-present. 
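Condensed from the hunk below, the accessed-bit path in handle_changed_spte_acc_track() changes along these lines (sketch):

    /* Before: new_spte's accessed bit is queried even if new_spte is not present. */
    if (is_accessed_spte(old_spte) &&
        (!is_accessed_spte(new_spte) || pfn_changed))
        kvm_set_pfn_accessed(spte_to_pfn(old_spte));

    /* After: a non-present new_spte is treated as the accessed state being lost. */
    if (is_accessed_spte(old_spte) &&
        (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
         spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
        kvm_set_pfn_accessed(spte_to_pfn(old_spte));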
Fixes: f8e144971c68 ("kvm: x86/mmu: Add access tracking for tdp_mmu") Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a279efd36952..62c588b2156a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -192,13 +192,12 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { - bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) return; if (is_accessed_spte(old_spte) && - (!is_accessed_spte(new_spte) || pfn_changed)) + (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } @@ -442,7 +441,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_leaf && is_dirty_spte(old_spte) && - (!is_dirty_spte(new_spte) || pfn_changed)) + (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); /* -- Gitee From 669c846c3a5c2fa6f813d56bcdf66bbbc50b3e58 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:28 -0800 Subject: [PATCH 058/158] KVM: x86/mmu: Bail from fast_page_fault() if SPTE is not shadow-present mainline inclusion from mainline-v5.13-rc1 commit ec89e643867148ab4a2a856a38717d2e89692be7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ec89e643867148ab4a2a856a38717d2e89692be7 ---------------------------------------------------------------------- Bail from fast_page_fault() if the SPTE is not a shadow-present SPTE. Functionally, this is not strictly necessary as the !is_access_allowed() check will eventually reject the fast path, but an early check on shadow-present skips unnecessary checks and will allow a future patch to tweak the A/D status auditing to warn if KVM attempts to query A/D bits without first ensuring the SPTE is a shadow-present SPTE. Note, is_shadow_present_pte() is quite expensive at this time, i.e. this might be a net negative in the short term. A future patch will optimize is_shadow_present_pte() to a single AND operation and remedy the issue. 
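Untangled from the hunk below, the retry loop in fast_page_fault() ends up shaped like this (sketch; the for_each_shadow_entry_lockless() line is surrounding context not shown in the hunk, the first check terminates the lockless walk, and the new one bails out of the enclosing do-while):

    for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
        if (!is_shadow_present_pte(spte))
            break;

    if (!is_shadow_present_pte(spte))
        break;

    sp = sptep_to_sp(iterator.sptep);
    if (!is_last_spte(spte, sp->role.level))
        break;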
Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 98d19476215f..2a9f00129f42 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3063,6 +3063,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!is_shadow_present_pte(spte)) break; + if (!is_shadow_present_pte(spte)) + break; + sp = sptep_to_sp(iterator.sptep); if (!is_last_spte(spte, sp->role.level)) break; -- Gitee From e0be05ce0833bcac1502bdefb769b78ca5bf2ecb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:29 -0800 Subject: [PATCH 059/158] KVM: x86/mmu: Disable MMIO caching if MMIO value collides with L1TF mainline inclusion from mainline-v5.13-rc1 commit 44aaa0150bfd576dc5043094fd1a23699cf280e8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=44aaa0150bfd576dc5043094fd1a23699cf280e8 ---------------------------------------------------------------------- Disable MMIO caching if the MMIO value collides with the L1TF mitigation that usurps high PFN bits. In practice this should never happen as only CPUs with SME support can generate such a collision (because the MMIO value can theoretically get adjusted into legal memory), and no CPUs exist that support SME and are susceptible to L1TF. But, closing the hole is trivial. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c51ad544f25b..866e568fa6f0 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -245,8 +245,19 @@ u64 mark_spte_for_access_track(u64 spte) void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); + + /* + * Disable MMIO caching if the MMIO value collides with the bits that + * are used to hold the relocated GFN when the L1TF mitigation is + * enabled. This should never fire as there is no known hardware that + * can trigger this condition, e.g. SME/SEV CPUs that require a custom + * MMIO value are not susceptible to L1TF. 
+ */ + if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << + SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) + mmio_value = 0; + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_access_mask = access_mask; } -- Gitee From f8e17a534eadbd9da064c9cc67d56caf0406342d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:31 -0800 Subject: [PATCH 060/158] KVM: x86/mmu: Don't install bogus MMIO SPTEs if MMIO caching is disabled mainline inclusion from mainline-v5.13-rc1 commit 30ab5901da57f16b919edfc4c5f8edf9311ba9c3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=30ab5901da57f16b919edfc4c5f8edf9311ba9c3 ---------------------------------------------------------------------- If MMIO caching is disabled, e.g. when using shadow paging on CPUs with 52 bits of PA space, go straight to MMIO emulation and don't install an MMIO SPTE. The SPTE will just generate a !PRESENT #PF, i.e. can't actually accelerate future MMIO. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-7-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 12 +++++++++++- arch/x86/kvm/mmu/spte.c | 7 ++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2a9f00129f42..96fe6a9745cf 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2948,9 +2948,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, return true; } - if (unlikely(is_noslot_pfn(pfn))) + if (unlikely(is_noslot_pfn(pfn))) { vcpu_cache_mmio_info(vcpu, gva, gfn, access & shadow_mmio_access_mask); + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. 
+ */ + if (unlikely(!shadow_mmio_value)) { + *ret_val = RET_PF_EMULATE; + return true; + } + } return false; } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 866e568fa6f0..0613a0a2d098 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -51,6 +51,8 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; + WARN_ON_ONCE(!shadow_mmio_value); + access &= shadow_mmio_access_mask; mask |= shadow_mmio_value | access; mask |= gpa | shadow_nonpresent_or_rsvd_mask; @@ -258,7 +260,10 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) mmio_value = 0; - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + if (mmio_value) + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + else + shadow_mmio_value = 0; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -- Gitee From fab5ec88b122e4b66d9e22b5de90031e417ac814 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:32 -0800 Subject: [PATCH 061/158] KVM: x86/mmu: Handle MMIO SPTEs directly in mmu_set_spte() mainline inclusion from mainline-v5.13-rc1 commit a54aa15c6bda3ca7e2f9e040ba968a1da303e24f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a54aa15c6bda3ca7e2f9e040ba968a1da303e24f ---------------------------------------------------------------------- Now that it should be impossible to convert a valid SPTE to an MMIO SPTE, handle MMIO SPTEs early in mmu_set_spte() without going through set_spte() and all the logic for removing an existing, valid SPTE. The other caller of set_spte(), FNAME(sync_page)(), explicitly handles MMIO SPTEs prior to calling set_spte(). This simplifies mmu_set_spte() and set_spte(), and also "fixes" an oddity where MMIO SPTEs are traced by both trace_kvm_mmu_set_spte() and trace_mark_mmio_spte(). Note, mmu_spte_set() will WARN if this new approach causes KVM to create an MMIO SPTE overtop a valid SPTE. 
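The resulting early-out in mmu_set_spte(), condensed from the hunk below, is simply:

    if (unlikely(is_noslot_pfn(pfn))) {
        mark_mmio_spte(vcpu, sptep, gfn, pte_access);
        return RET_PF_EMULATE;    /* never reaches the existing-SPTE handling */
    }

set_spte() correspondingly loses its set_mmio_spte() fast path, while FNAME(sync_page)() keeps its own pre-existing MMIO filtering.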
Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-8-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 96fe6a9745cf..7cc05ce85b8d 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -239,17 +239,6 @@ static unsigned get_mmio_spte_access(u64 spte) return spte & shadow_mmio_access_mask; } -static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - kvm_pfn_t pfn, unsigned int access) -{ - if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(vcpu, sptep, gfn, access); - return true; - } - - return false; -} - static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { u64 kvm_gen, spte_gen, gen; @@ -2570,9 +2559,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp; int ret; - if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) - return 0; - sp = sptep_to_sp(sptep); ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, @@ -2602,6 +2588,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); + if (unlikely(is_noslot_pfn(pfn))) { + mark_mmio_spte(vcpu, sptep, gfn, pte_access); + return RET_PF_EMULATE; + } + if (is_shadow_present_pte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink @@ -2635,9 +2626,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - if (unlikely(is_mmio_spte(*sptep))) - ret = RET_PF_EMULATE; - /* * The fault is fully spurious if and only if the new SPTE and old SPTE * are identical, and emulation is not required. -- Gitee From eb5d0192b3c5a55af4b8763e0e5ae8f14f336b6f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:33 -0800 Subject: [PATCH 062/158] KVM: x86/mmu: Drop redundant trace_kvm_mmu_set_spte() in the TDP MMU mainline inclusion from mainline-v5.13-rc1 commit 3849e0924ef14a245aa292ecaa9accdc4792012c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3849e0924ef14a245aa292ecaa9accdc4792012c ---------------------------------------------------------------------- Remove TDP MMU's call to trace_kvm_mmu_set_spte() that is done for both shadow-present SPTEs and MMIO SPTEs. It's fully redundant for the former, and unnecessary for the latter. This aligns TDP MMU tracing behavior with that of the legacy MMU. 
Fixes: 33dd3574f5fe ("kvm: x86/mmu: Add existing trace points to TDP MMU") Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-9-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 62c588b2156a..906fa0c7b6b4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -763,12 +763,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, new_spte); ret = RET_PF_EMULATE; - } else + } else { trace_kvm_mmu_set_spte(iter->level, iter->gfn, rcu_dereference(iter->sptep)); + } - trace_kvm_mmu_set_spte(iter->level, iter->gfn, - rcu_dereference(iter->sptep)); if (!prefault) vcpu->stat.pf_fixed++; -- Gitee From 357051a5b4153c0a711598f0ce5222f45dc6c9b4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:34 -0800 Subject: [PATCH 063/158] KVM: x86/mmu: Rename 'mask' to 'spte' in MMIO SPTE helpers mainline inclusion from mainline-v5.13-rc1 commit c236d9623f7801e89a7b13e29df6709f0b216961 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c236d9623f7801e89a7b13e29df6709f0b216961 ---------------------------------------------------------------------- The value returned by make_mmio_spte() is a SPTE, it is not a mask. Name it accordingly. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-10-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 6 +++--- arch/x86/kvm/mmu/spte.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7cc05ce85b8d..f59f9480e8f8 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -218,10 +218,10 @@ bool is_nx_huge_page_enabled(void) static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { - u64 mask = make_mmio_spte(vcpu, gfn, access); + u64 spte = make_mmio_spte(vcpu, gfn, access); - trace_mark_mmio_spte(sptep, gfn, mask); - mmu_spte_set(sptep, mask); + trace_mark_mmio_spte(sptep, gfn, spte); + mmu_spte_set(sptep, spte); } static gfn_t get_mmio_spte_gfn(u64 spte) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 0613a0a2d098..03915788f39b 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -48,18 +48,18 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); + u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; WARN_ON_ONCE(!shadow_mmio_value); access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + spte |= shadow_mmio_value | access; + spte |= gpa | shadow_nonpresent_or_rsvd_mask; + spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; - return mask; + return spte; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) -- Gitee From c9fe2e513dc05380355bcc4e91d9297ed8fb1772 Mon Sep 17 00:00:00 2001 From: Sean 
Christopherson Date: Thu, 25 Feb 2021 12:47:35 -0800 Subject: [PATCH 064/158] KVM: x86/mmu: Stop using software available bits to denote MMIO SPTEs mainline inclusion from mainline-v5.13-rc1 commit 8120337a4c5502118e255b170799040eefe2f280 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8120337a4c5502118e255b170799040eefe2f280 ---------------------------------------------------------------------- Stop tagging MMIO SPTEs with specific available bits and instead detect MMIO SPTEs by checking for their unique SPTE value. The value is guaranteed to be unique on shadow paging and NPT as setting reserved physical address bits on any other type of SPTE would constitute a KVM bug. Ditto for EPT, as creating a WX non-MMIO would also be a bug. Note, this approach is also future-compatible with TDX, which will need to reflect MMIO EPT violations as #VEs into the guest. To create an EPT violation instead of a misconfig, TDX EPTs will need to have RWX=0. But, MMIO SPTEs will also be the only case where KVM clears SUPPRESS_VE, so MMIO SPTEs will still be guaranteed to have a unique value within a given MMU context. The main motivation is to make it easier to reason about which types of SPTEs use which available bits. As a happy side effect, this frees up two more bits for storing the MMIO generation. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-11-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/spte.c | 11 ++++++----- arch/x86/kvm/mmu/spte.h | 10 ++++------ arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 3 ++- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f61e18dad2f3..e7dfb0db40e9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -52,7 +52,7 @@ static inline u64 rsvd_bits(int s, int e) return ((2ULL << (e - s)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f59f9480e8f8..43fee738115d 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5906,7 +5906,7 @@ static void kvm_set_mmio_spte_mask(void) else mask = 0; - kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } static bool get_nx_auto_mode(void) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 03915788f39b..20fe15c21ee6 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -23,6 +23,7 @@ u64 __read_mostly shadow_user_mask; u64 __read_mostly shadow_accessed_mask; u64 __read_mostly shadow_dirty_mask; u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; u64 __read_mostly shadow_me_mask; @@ -163,6 +164,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte = mark_spte_for_access_track(spte); out: + WARN_ON(is_mmio_spte(spte)); *new_spte = spte; return ret; } @@ -244,7 +246,7 @@ u64 mark_spte_for_access_track(u64 spte) return spte; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 
access_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); @@ -260,10 +262,9 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) mmio_value = 0; - if (mmio_value) - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - else - shadow_mmio_value = 0; + WARN_ON((mmio_value & mmio_mask) != mmio_value); + shadow_mmio_value = mmio_value; + shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 8a8e47bbf804..3c8d67af8d2e 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -8,15 +8,11 @@ #define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 54 -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. - */ +/* The mask used to denote Access Tracking SPTEs. Note, val=3 is available. */ #define SPTE_SPECIAL_MASK (3ULL << 52) #define SPTE_AD_ENABLED_MASK (0ULL << 52) #define SPTE_AD_DISABLED_MASK (1ULL << 52) #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) @@ -92,6 +88,7 @@ extern u64 __read_mostly shadow_user_mask; extern u64 __read_mostly shadow_accessed_mask; extern u64 __read_mostly shadow_dirty_mask; extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; extern u64 __read_mostly shadow_me_mask; @@ -161,7 +158,8 @@ extern u8 __read_mostly shadow_phys_bits; static inline bool is_mmio_spte(u64 spte) { - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; + return (spte & shadow_mmio_mask) == shadow_mmio_value && + likely(shadow_mmio_value); } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c10435ae3158..bd59fdc0686f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -870,7 +870,7 @@ static __init void svm_adjust_mmio_mask(void) */ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); } static void svm_hardware_teardown(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b5a57b0a8f5b..37894139ef3c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4585,7 +4585,8 @@ static void ept_set_mmio_spte_mask(void) * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). 
*/ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, + VMX_EPT_RWX_MASK, 0); } static inline int vmx_get_pid_table_order(struct kvm *kvm) -- Gitee From d9bb00db8f2ec794cf2521e7db44311aa6024f11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:37 -0800 Subject: [PATCH 065/158] KVM: x86/mmu: Rename and document A/D scheme for TDP SPTEs mainline inclusion from mainline-v5.13-rc1 commit 8a406c89532c91ee50688d4e728474dd09a11be3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8a406c89532c91ee50688d4e728474dd09a11be3 ---------------------------------------------------------------------- Rename the various A/D status defines to explicitly associated them with TDP. There is a subtle dependency on the bits in question never being set when using PAE paging, as those bits are reserved, not available. I.e. using these bits outside of TDP (technically EPT) would cause explosions. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-13-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- Documentation/virt/kvm/locking.rst | 37 +++++++++++++++--------------- arch/x86/kvm/mmu/spte.c | 17 ++++++++++---- arch/x86/kvm/mmu/spte.h | 34 ++++++++++++++++++++------- 3 files changed, 56 insertions(+), 32 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 0aa4817b466d..85876afe0441 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -38,12 +38,11 @@ the mmu-lock on x86. Currently, the page fault can be fast in one of the following two cases: 1. Access Tracking: The SPTE is not present, but it is marked for access - tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to - restore the saved R/X bits. This is described in more detail later below. + tracking. That means we need to restore the saved R/X bits. This is + described in more detail later below. -2. Write-Protection: The SPTE is present and the fault is - caused by write-protect. That means we just need to change the W bit of - the spte. +2. Write-Protection: The SPTE is present and the fault is caused by + write-protect. That means we just need to change the W bit of the spte. What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and SPTE_MMU_WRITEABLE bit on the spte: @@ -54,9 +53,9 @@ SPTE_MMU_WRITEABLE bit on the spte: page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or -restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This -is safe because whenever changing these bits can be detected by cmpxchg. +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, to +restore the saved R/X bits if for an access-traced spte, or both. This is +safe because whenever changing these bits can be detected by cmpxchg. But we need carefully check these cases: @@ -185,17 +184,17 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update(). Lockless Access Tracking: This is used for Intel CPUs that are using EPT but do not support the EPT A/D -bits. 
In this case, when the KVM MMU notifier is called to track accesses to a -page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present -by clearing the RWX bits in the PTE and storing the original R & X bits in -some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the -PTE (using the ignored bit 62). When the VM tries to access the page later on, -a fault is generated and the fast page fault mechanism described above is used -to atomically restore the PTE to a Present state. The W bit is not saved when -the PTE is marked for access tracking and during restoration to the Present -state, the W bit is set depending on whether or not it was a write access. If -it wasn't, then the W bit will remain clear until a write access happens, at -which time it will be set using the Dirty tracking mechanism described above. +bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and +when the KVM MMU notifier is called to track accesses to a page (via +kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware +by clearing the RWX bits in the PTE and storing the original R & X bits in more +unused/ignored bits. When the VM tries to access the page later on, a fault is +generated and the fast page fault mechanism described above is used to +atomically restore the PTE to a Present state. The W bit is not saved when the +PTE is marked for access tracking and during restoration to the Present state, +the W bit is set depending on whether or not it was a write access. If it +wasn't, then the W bit will remain clear until a write access happens, at which +time it will be set using the Dirty tracking mechanism described above. 3. Reference ------------ diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 20fe15c21ee6..3fdba5efe0bc 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -39,7 +39,7 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 mask; WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_TDP_AD_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; @@ -93,9 +93,16 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, int ret = 0; if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; + spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; + + /* + * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set + * if PAE paging may be employed (shadow paging or any 32-bit KVM). 
+ */ + WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) && + (spte & SPTE_TDP_AD_MASK)); /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -177,7 +184,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) shadow_user_mask | shadow_x_mask | shadow_me_mask; if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -282,7 +289,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); + BUG_ON(acc_track_mask & SPTE_TDP_AD_MASK); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 3c8d67af8d2e..168c113375d5 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -8,11 +8,24 @@ #define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 54 -/* The mask used to denote Access Tracking SPTEs. Note, val=3 is available. */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +/* + * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also + * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. + * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that + * is must be employed for a given TDP SPTE. + * + * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE + * paging, including NPT PAE. This scheme works because legacy shadow paging + * is guaranteed to have A/D bits and write-protection is forced only for + * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it + * must be restricted to 64-bit KVM. + */ +#define SPTE_TDP_AD_SHIFT 52 +#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT) +static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) @@ -94,7 +107,7 @@ extern u64 __read_mostly shadow_present_mask; extern u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ @@ -170,13 +183,18 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; } static inline bool spte_ad_need_write_protect(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; + /* + * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', + * and non-TDP SPTEs will never set these bits. Optimize for 64-bit + * TDP and do the A/D type check unconditionally. 
+ */ + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) -- Gitee From cb5123fd661e058fd38f7dc9384d7244f44dfffa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:38 -0800 Subject: [PATCH 066/158] KVM: x86/mmu: Use MMIO SPTE bits 53 and 52 for the MMIO generation mainline inclusion from mainline-v5.13-rc1 commit b0de568018a6cd216ae060c33832e898f870abed category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b0de568018a6cd216ae060c33832e898f870abed ---------------------------------------------------------------------- Use bits 53 and 52 for the MMIO generation now that they're not used to identify MMIO SPTEs. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-14-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.c | 1 - arch/x86/kvm/mmu/spte.h | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 3fdba5efe0bc..31bf424f626a 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -39,7 +39,6 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 mask; WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_TDP_AD_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 168c113375d5..15c3a8297295 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -59,11 +59,11 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) /* - * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of + * Due to limited space in PTEs, the MMIO generation is a 20 bit subset of * the memslots generation and is derived as follows: * * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62 + * Bits 9-19 of the MMIO generation are propagated to spte bits 52-62 * * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in * the MMIO generation number, as doing so would require stealing a bit from @@ -76,7 +76,7 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 11 -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_START 52 #define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ @@ -88,7 +88,7 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) /* remember to adjust the comment above as well if you change these */ -static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9); +static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) -- Gitee From 2e8853528da397bc6be67113c7afbce1e8298dfd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:39 -0800 Subject: [PATCH 067/158] KVM: x86/mmu: Document dependency bewteen TDP A/D type 
and saved bits mainline inclusion from mainline-v5.13-rc1 commit c4827eabe1a89ca82335a9e90bf8ed19a63ba063 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c4827eabe1a89ca82335a9e90bf8ed19a63ba063 ---------------------------------------------------------------------- Document that SHADOW_ACC_TRACK_SAVED_BITS_SHIFT is directly dependent on bits 53:52 being used to track the A/D type. Remove PT64_SECOND_AVAIL_BITS_SHIFT as it is at best misleading, and at worst wrong. For PAE paging, which arguably is a variant of PT64, the bits are reserved. For MMIO SPTEs the bits are not available as they're used for the MMIO generation. For access tracked SPTEs, they are also not available as bits 56:54 are used to store the original RX bits. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-15-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 15c3a8297295..ae8502721e28 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -6,7 +6,6 @@ #include "mmu_internal.h" #define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 /* * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also @@ -128,11 +127,14 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. + * restored only when a write is attempted to the page. This mask obviously + * must not overlap the A/D type mask. */ #define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ PT64_EPT_EXECUTABLE_MASK) -#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT +#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 +static_assert(!(SPTE_TDP_AD_MASK & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT))); /* * If a thread running without exclusive control of the MMU lock must perform a -- Gitee From 3112c3a1a3ba767e0148f2f10b0f07e21d1194e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:40 -0800 Subject: [PATCH 068/158] KVM: x86/mmu: Move initial kvm_mmu_set_mask_ptes() call into MMU proper mainline inclusion from mainline-v5.13-rc1 commit ec761cfd353f3b37072cda095d245c155c7cdb0f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ec761cfd353f3b37072cda095d245c155c7cdb0f ---------------------------------------------------------------------- Move kvm_mmu_set_mask_ptes() into mmu.c as prep for future cleanup of the mask initialization code. No functional change intended. 
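After the move, the default masks are established from the MMU's own module-init path, roughly as follows (per the hunk below):

    /* in kvm_mmu_vendor_module_init() */
    kvm_set_mmio_spte_mask();

    kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                          PT_DIRTY_MASK, PT64_NX_MASK, 0,
                          PT_PRESENT_MASK, 0, sme_me_mask);

kvm_arch_init() no longer touches the PTE masks at all.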
Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-16-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 4 ++++ arch/x86/kvm/x86.c | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 43fee738115d..4ee18465aece 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5988,6 +5988,10 @@ int kvm_mmu_vendor_module_init(void) kvm_set_mmio_spte_mask(); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0, + PT_PRESENT_MASK, 0, sme_me_mask); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 396e32d81438..fe7e07ec04f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8407,9 +8407,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) -- Gitee From 9933e53101e6f1201cd705e452746b49a43e1337 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:41 -0800 Subject: [PATCH 069/158] KVM: x86/mmu: Co-locate code for setting various SPTE masks mainline inclusion from mainline-v5.13-rc1 commit d6b87f256591cf6be78825db6a09a5218666e539 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6b87f256591cf6be78825db6a09a5218666e539 ---------------------------------------------------------------------- Squish all the code for (re)setting the various SPTE masks into one location. With the split code, it's not at all clear that the masks are set once during module initialization. This will allow a future patch to clean up initialization of the masks without shuffling code all over tarnation. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-17-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/vmx/vmx.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 25 ------------------------- arch/x86/kvm/mmu/spte.c | 19 +++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 17 ++++++----------- 3 files changed, 25 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4ee18465aece..47e5ad8e4299 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5890,25 +5890,6 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - - /* - * Set a reserved PA bit in MMIO SPTEs to generate page faults with - * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT - * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports - * 52-bit physical addresses then there are no reserved PA bits in the - * PTEs and so the reserved PA approach must be disabled. 
- */ - if (shadow_phys_bits < 52) - mask = BIT_ULL(51) | PT_PRESENT_MASK; - else - mask = 0; - - kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); -} - static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ @@ -5986,12 +5967,6 @@ int kvm_mmu_vendor_module_init(void) kvm_mmu_reset_all_pte_masks(); - kvm_set_mmio_spte_mask(); - - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0, sme_me_mask); - pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 31bf424f626a..a22a6202dc92 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -304,6 +304,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; + u64 mask; shadow_user_mask = 0; shadow_accessed_mask = 0; @@ -338,4 +339,22 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); + + /* + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. + */ + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); + + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0, + PT_PRESENT_MASK, 0, sme_me_mask); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 37894139ef3c..70b20f0b44a9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4579,16 +4579,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vmx->secondary_exec_control = exec_control; } -static void ept_set_mmio_spte_mask(void) -{ - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). - */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, - VMX_EPT_RWX_MASK, 0); -} - static inline int vmx_get_pid_table_order(struct kvm *kvm) { return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); @@ -5863,7 +5853,12 @@ static void vmx_enable_tdp(void) cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, VMX_EPT_RWX_MASK, 0ull); - ept_set_mmio_spte_mask(); + /* + * EPT Misconfigurations can be generated if the value of bits 2:0 + * of an EPT paging-structure entry is 110b (write/execute). 
+ */ + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, + VMX_EPT_RWX_MASK, 0); } /* -- Gitee From 833fb3a376c4926893feb85da438cd177974441e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:42 -0800 Subject: [PATCH 070/158] KVM: x86/mmu: Move logic for setting SPTE masks for EPT into the MMU proper mainline inclusion from mainline-v5.13-rc1 commit e7b7bdea77f3277fe49f714c983d0f38f7cb0d86 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e7b7bdea77f3277fe49f714c983d0f38f7cb0d86 ---------------------------------------------------------------------- Let the MMU deal with the SPTE masks to avoid splitting the logic and knowledge across the MMU and VMX. The SPTE masks that are used for EPT are very, very tightly coupled to the MMU implementation. The use of available bits, the existence of A/D types, the fact that shadow_x_mask even exists, and so on and so forth are all baked into the MMU implementation. Cross referencing the params to the masks is also a nightmare, as pretty much every param is a u64. A future patch will make the location of the MMU_WRITABLE and HOST_WRITABLE bits MMU specific, to free up bit 11 for a MMU_PRESENT bit. Doing that change with the current kvm_mmu_set_mask_ptes() would be an absolute mess. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-18-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 3 -- arch/x86/kvm/mmu.h | 1 + arch/x86/kvm/mmu/spte.c | 62 ++++++++++++++------------------- arch/x86/kvm/vmx/vmx.c | 20 ++--------- 4 files changed, 30 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e4d2a7d4084e..33df6a08f7d7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1419,9 +1419,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e7dfb0db40e9..5835112026d3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -53,6 +53,7 @@ static inline u64 rsvd_bits(int s, int e) } void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index a22a6202dc92..7c854097325d 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -16,6 +16,7 @@ #include "spte.h" #include +#include u64 __read_mostly shadow_nx_mask; u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ @@ -275,45 +276,31 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -/* - * Sets the shadow PTE masks used by the MMU. 
- * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_TDP_AD_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; + shadow_user_mask = VMX_EPT_READABLE_MASK; + shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; + shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_nx_mask = 0ull; + shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; + shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; + shadow_me_mask = 0ull; + + /* + * EPT Misconfigurations are generated if the value of bits 2:0 + * of an EPT paging-structure entry is 110b (write/execute). + */ + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, + VMX_EPT_RWX_MASK, 0); } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; u64 mask; - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; - shadow_phys_bits = kvm_get_shadow_phys_bits(); /* @@ -340,6 +327,15 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); + shadow_user_mask = PT_USER_MASK; + shadow_accessed_mask = PT_ACCESSED_MASK; + shadow_dirty_mask = PT_DIRTY_MASK; + shadow_nx_mask = PT64_NX_MASK; + shadow_x_mask = 0; + shadow_present_mask = PT_PRESENT_MASK; + shadow_acc_track_mask = 0; + shadow_me_mask = sme_me_mask; + /* * Set a reserved PA bit in MMIO SPTEs to generate page faults with * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT @@ -353,8 +349,4 @@ void kvm_mmu_reset_all_pte_masks(void) mask = 0; kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); - - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0, sme_me_mask); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 70b20f0b44a9..591ea47a8f41 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5844,23 +5844,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -static void vmx_enable_tdp(void) -{ - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, - enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK, 0ull); - - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). - */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, - VMX_EPT_RWX_MASK, 0); -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 
@@ -8391,7 +8374,8 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) - vmx_enable_tdp(); + kvm_mmu_set_ept_masks(enable_ept_ad_bits, + cpu_has_vmx_ept_execute_only()); if (!enable_ept) ept_lpage_level = 0; -- Gitee From 8f7660fc14fce8c2b42b9d272d000fa500e0af49 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:43 -0800 Subject: [PATCH 071/158] KVM: x86/mmu: Make Host-writable and MMU-writable bit locations dynamic mainline inclusion from mainline-v5.13-rc1 commit 5fc3424f8b854584f8f6fb6ea03f1419487fdc96 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5fc3424f8b854584f8f6fb6ea03f1419487fdc96 ---------------------------------------------------------------------- Make the location of the HOST_WRITABLE and MMU_WRITABLE configurable for a given KVM instance. This will allow EPT to use high available bits, which in turn will free up bit 11 for a constant MMU_PRESENT bit. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-19-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- Documentation/virt/kvm/locking.rst | 18 +++++++++--------- arch/x86/kvm/mmu.h | 12 ++++++------ arch/x86/kvm/mmu/mmu.c | 8 ++++---- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- arch/x86/kvm/mmu/spte.c | 13 +++++++++---- arch/x86/kvm/mmu/spte.h | 13 ++++++------- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++--- 7 files changed, 38 insertions(+), 34 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 85876afe0441..1fc860c007a3 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -44,18 +44,18 @@ following two cases: 2. Write-Protection: The SPTE is present and the fault is caused by write-protect. That means we just need to change the W bit of the spte. -What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and -SPTE_MMU_WRITEABLE bit on the spte: +What we use to avoid all the race is the Host-writable bit and MMU-writable bit +on the spte: -- SPTE_HOST_WRITEABLE means the gfn is writable on host. -- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when - the gfn is writable on guest mmu and it is not write-protected by shadow - page write-protection. +- Host-writable means the gfn is writable in the host kernel page tables and in + its KVM memslot. +- MMU-writable means the gfn is writable in the guest's mmu and it is not + write-protected by shadow page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, to -restore the saved R/X bits if for an access-traced spte, or both. This is -safe because whenever changing these bits can be detected by cmpxchg. +bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, to restore the saved +R/X bits if for an access-traced spte, or both. This is safe because whenever +changing these bits can be detected by cmpxchg. 
But we need carefully check these cases: diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5835112026d3..d328ce51dddf 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -118,7 +118,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * write-protects guest page to sync the guest modification, b) another one is * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences * between these two sorts are: - * 1) the first case clears SPTE_MMU_WRITEABLE bit. + * 1) the first case clears MMU-writable bit. * 2) the first case requires flushing tlb immediately avoiding corrupting * shadow page table between all vcpus so it should be in the protection of * mmu-lock. And the another case does not need to flush tlb until returning @@ -129,17 +129,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * So, there is the problem: the first case can meet the corrupted tlb caused * by another case which write-protects pages but without flush tlb * immediately. In order to making the first case be aware this problem we let - * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit - * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit. + * it flush tlb if we try to write-protect a spte whose MMU-writable bit + * is set, it works since another case never touches MMU-writable bit. * * Anyway, whenever a spte is updated (only permission and status bits are - * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes + * changed) we need to check whether the spte with MMU-writable becomes * readonly, if that happens, we need to flush tlb. Fortunately, * mmu_spte_update() has already handled it perfectly. * - * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK: + * The rules to use MMU-writable and PT_WRITABLE_MASK: * - if we want to see if it has writable tlb entry or if the spte can be - * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most + * writable on the mmu mapping, check MMU-writable, this is the most * case, otherwise * - if we fix page fault on the spte or do write-protection by dirty logging, * check PT_WRITABLE_MASK. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 47e5ad8e4299..9ea7eb7a646a 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1104,7 +1104,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); if (pt_protect) - spte &= ~SPTE_MMU_WRITEABLE; + spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; return mmu_spte_update(sptep, spte); @@ -5633,9 +5633,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * spte from present to present (changing the spte from present * to nonpresent will flush all the TLBs immediately), in other * words, the only case we care is mmu_spte_update() where we - * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE - * instead of PT_WRITABLE_MASK, that means it does not depend - * on PT_WRITABLE_MASK anymore. + * have checked Host-writable | MMU-writable instead of + * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK + * anymore. 
*/ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 51687648e7e5..20a269bce359 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -1085,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; - host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; + host_writable = sp->spt[i] & shadow_host_writable_mask; set_spte_ret |= set_spte(vcpu, &sp->spt[i], pte_access, PG_LEVEL_4K, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 7c854097325d..a97f48a2161a 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -18,6 +18,8 @@ #include #include +u64 __read_mostly shadow_host_writable_mask; +u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ u64 __read_mostly shadow_user_mask; @@ -134,7 +136,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, kvm_is_mmio_pfn(pfn)); if (host_writable) - spte |= SPTE_HOST_WRITEABLE; + spte |= shadow_host_writable_mask; else pte_access &= ~ACC_WRITE_MASK; @@ -144,7 +146,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; /* * Optimization: for pte sync, if spte was writable the hash @@ -160,7 +162,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, __func__, gfn); ret |= SET_SPTE_WRITE_PROTECTED_PT; pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } } @@ -199,7 +201,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_host_writable_mask; new_spte = mark_spte_for_access_track(new_spte); @@ -336,6 +338,9 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_acc_track_mask = 0; shadow_me_mask = sme_me_mask; + shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE; + shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE; + /* * Set a reserved PA bit in MMIO SPTEs to generate page faults with * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index ae8502721e28..517c504a24c1 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -5,8 +5,6 @@ #include "mmu_internal.h" -#define PT_FIRST_AVAIL_BITS_SHIFT 10 - /* * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. 
@@ -53,9 +51,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(10) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(11) /* * Due to limited space in PTEs, the MMIO generation is a 20 bit subset of @@ -94,6 +91,8 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) +extern u64 __read_mostly shadow_host_writable_mask; +extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ extern u64 __read_mostly shadow_user_mask; @@ -258,8 +257,8 @@ static inline bool is_dirty_spte(u64 spte) static inline bool spte_can_locklessly_be_made_writable(u64 spte) { - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); + return (spte & shadow_host_writable_mask) && + (spte & shadow_mmu_writable_mask); } static inline u64 get_mmio_spte_generation(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 906fa0c7b6b4..c5663cea1946 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1383,7 +1383,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, /* * Removes write access on the last level SPTE mapping this GFN and unsets the - * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, @@ -1397,7 +1397,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { new_spte = iter.old_spte & - ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); if (new_spte == iter.old_spte) break; @@ -1413,7 +1413,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, /* * Removes write access on the last level SPTE mapping this GFN and unsets the - * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, -- Gitee From 8fd9a745ef1a3e7d6a1a76326c76d738c41f2db6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:44 -0800 Subject: [PATCH 072/158] KVM: x86/mmu: Use high bits for host/mmu writable masks for EPT SPTEs mainline inclusion from mainline-v5.13-rc1 commit 613a3f3797528be489d280c35c4f6ebfcbe77e9e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=613a3f3797528be489d280c35c4f6ebfcbe77e9e ---------------------------------------------------------------------- Use bits 57 and 58 for HOST_WRITABLE and MMU_WRITABLE when using EPT. 
This will allow using bit 11 as a constant MMU_PRESENT, which is desirable as checking for a shadow-present SPTE is one of the most common SPTE operations in KVM, particular in hot paths such as page faults. EPT is short on low available bits; currently only bit 11 is the only always-available bit. Bit 10 is also available, but only while KVM doesn't support mode-based execution. On the other hand, PAE paging doesn't have _any_ high available bits. Thus, using bit 11 is the only feasible option for MMU_PRESENT. Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-20-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.c | 3 +++ arch/x86/kvm/mmu/spte.h | 48 ++++++++++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index a97f48a2161a..513defacfb82 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -289,6 +289,9 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_me_mask = 0ull; + shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; + shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; + /* * EPT Misconfigurations are generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 517c504a24c1..e5a009a35b03 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -51,8 +51,39 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) -#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(10) -#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(11) +/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. This mask obviously + * must not overlap the A/D type mask. + */ +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ + PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 +#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) +static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); + +/* + * Low ignored bits are at a premium for EPT, use high ignored bits, taking care + * to not overlap the A/D type mask or the saved access bits of access-tracked + * SPTEs when A/D bits are disabled. + */ +#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) +#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) + +static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); +static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); +static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); +static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); + +/* Defined only to keep the above static asserts readable. 
*/ +#undef SHADOW_ACC_TRACK_SAVED_MASK /* * Due to limited space in PTEs, the MMIO generation is a 20 bit subset of @@ -122,19 +153,6 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; */ #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 -/* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. This mask obviously - * must not overlap the A/D type mask. - */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) -#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 -static_assert(!(SPTE_TDP_AD_MASK & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << - SHADOW_ACC_TRACK_SAVED_BITS_SHIFT))); - /* * If a thread running without exclusive control of the MMU lock must perform a * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a -- Gitee From 3faaccf6fc3c485476176ff9c16c4063a784b7c8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:45 -0800 Subject: [PATCH 073/158] KVM: x86/mmu: Use a dedicated bit to track shadow/MMU-present SPTEs mainline inclusion from mainline-v5.13-rc1 commit edea7c4fc215c7ee1cc98363b016ad505cbac9f7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=edea7c4fc215c7ee1cc98363b016ad505cbac9f7 ---------------------------------------------------------------------- Introduce MMU_PRESENT to explicitly track which SPTEs are "present" from the MMU's perspective. Checking for shadow-present SPTEs is a very common operation for the MMU, particularly in hot paths such as page faults. With the addition of "removed" SPTEs for the TDP MMU, identifying shadow-present SPTEs is quite costly especially since it requires checking multiple 64-bit values. On 64-bit KVM, this reduces the footprint of kvm.ko's .text by ~2k bytes. On 32-bit KVM, this increases the footprint by ~200 bytes, but only because gcc now inlines several more MMU helpers, e.g. drop_parent_pte(). We now need to drop bit 11, used for the MMU_PRESENT flag, from the set of bits used to store the generation number in MMIO SPTEs. Otherwise MMIO SPTEs with bit 11 set would get false positives for is_shadow_present_spte() and lead to a variety of fireworks, from oopses to likely hangs of the host kernel. 
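For illustration only (the hunks below are authoritative), the shadow-present
check collapses from a chain of value comparisons into a single test of an
always-ignored low bit; bit 11 is set on every MMU-present SPTE and kept clear
in MMIO and removed SPTEs:

        /* before: compare against the 0, MMIO and REMOVED encodings */
        return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);

        /* after: a single bit test, cheap in hot paths such as page faults */
        return !!(pte & SPTE_MMU_PRESENT_MASK);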
Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-21-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.c | 8 ++++---- arch/x86/kvm/mmu/spte.h | 23 +++++++++++++++++------ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 513defacfb82..29dc77f38c00 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -91,7 +91,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, bool can_unsync, bool host_writable, bool ad_disabled, u64 *new_spte) { - u64 spte = 0; + u64 spte = SPTE_MMU_PRESENT_MASK; int ret = 0; if (ad_disabled) @@ -180,10 +180,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { - u64 spte; + u64 spte = SPTE_MMU_PRESENT_MASK; - spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; + spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; if (ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index e5a009a35b03..b932a98ac6d1 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -5,6 +5,15 @@ #include "mmu_internal.h" +/* + * A MMU present SPTE is backed by actual memory and may or may not be present + * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it + * is ignored by all flavors of SPTEs and checking a low bit often generates + * better code than for a high bit, e.g. 56+. MMU present checks are pervasive + * enough that the improved code generation is noticeable in KVM's footprint. + */ +#define SPTE_MMU_PRESENT_MASK BIT_ULL(11) + /* * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. 
@@ -86,11 +95,11 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); #undef SHADOW_ACC_TRACK_SAVED_MASK /* - * Due to limited space in PTEs, the MMIO generation is a 20 bit subset of + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-19 of the MMIO generation are propagated to spte bits 52-62 + * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 + * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 * * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in * the MMIO generation number, as doing so would require stealing a bit from @@ -101,7 +110,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); */ #define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_END 10 #define MMIO_SPTE_GEN_HIGH_START 52 #define MMIO_SPTE_GEN_HIGH_END 62 @@ -110,12 +119,14 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); MMIO_SPTE_GEN_LOW_START) #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) +static_assert(!(SPTE_MMU_PRESENT_MASK & + (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) /* remember to adjust the comment above as well if you change these */ -static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 11); +static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) @@ -235,7 +246,7 @@ static inline bool is_access_track_spte(u64 spte) static inline bool is_shadow_present_pte(u64 pte) { - return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte); + return !!(pte & SPTE_MMU_PRESENT_MASK); } static inline bool is_large_pte(u64 pte) -- Gitee From 824c87c575cf50476f632c78a4f945ee18daf841 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:46 -0800 Subject: [PATCH 074/158] KVM: x86/mmu: Tweak auditing WARN for A/D bits to !PRESENT (was MMIO) mainline inclusion from mainline-v5.13-rc1 commit 8f366ae6d8c5eaaae086016934802954abb8959e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8f366ae6d8c5eaaae086016934802954abb8959e ---------------------------------------------------------------------- Tweak the MMU_WARN that guards against weirdness when querying A/D status to fire on a !MMU_PRESENT SPTE, as opposed to a MMIO SPTE. Attempting to query A/D status on any kind of !MMU_PRESENT SPTE, MMIO or otherwise, indicates a KVM bug. Case in point, several now-fixed bugs were identified by enabling this new WARN. 
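For context only, and assuming the MMU_DEBUG-gated definition in
mmu_internal.h is unchanged by this series, the tightened assertion is free in
production builds because MMU_WARN_ON() compiles away:

        #ifdef MMU_DEBUG
        #define MMU_WARN_ON(x) WARN_ON(x)
        #else
        #define MMU_WARN_ON(x) do { } while (0)
        #endif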
Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-22-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index b932a98ac6d1..efb9a1057cd4 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -205,6 +205,11 @@ static inline bool is_mmio_spte(u64 spte) likely(shadow_mmio_value); } +static inline bool is_shadow_present_pte(u64 pte) +{ + return !!(pte & SPTE_MMU_PRESENT_MASK); +} + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; @@ -212,13 +217,13 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; } static inline bool spte_ad_need_write_protect(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); /* * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit @@ -229,13 +234,13 @@ static inline bool spte_ad_need_write_protect(u64 spte) static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -244,11 +249,6 @@ static inline bool is_access_track_spte(u64 spte) return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } -static inline bool is_shadow_present_pte(u64 pte) -{ - return !!(pte & SPTE_MMU_PRESENT_MASK); -} - static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; -- Gitee From cef8a6492b13f5612c8562843c03aecc1ae68ae4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:47 -0800 Subject: [PATCH 075/158] KVM: x86/mmu: Use is_removed_spte() instead of open coded equivalents mainline inclusion from mainline-v5.13-rc1 commit 7a51393ae0f6287f9b2bdd138f5a091b7c302c1f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7a51393ae0f6287f9b2bdd138f5a091b7c302c1f ---------------------------------------------------------------------- Use the is_removed_spte() helper instead of open coding the check. No functional change intended. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-23-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c5663cea1946..038db3f7f75f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -484,7 +484,7 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, * Do not change removed SPTEs. Only the thread that froze the SPTE * may modify it. 
*/ - if (iter->old_spte == REMOVED_SPTE) + if (is_removed_spte(iter->old_spte)) return false; if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, @@ -555,7 +555,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, * should be used. If operating under the MMU lock in write mode, the * use of the removed SPTE should not be necessary. */ - WARN_ON(iter->old_spte == REMOVED_SPTE); + WARN_ON(is_removed_spte(iter->old_spte)); WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); -- Gitee From bebb471f3d40e20f982ff085eba5d347ccb47a3f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 26 Feb 2022 00:15:29 +0000 Subject: [PATCH 076/158] KVM: x86/mmu: WARN if old _or_ new SPTE is REMOVED in non-atomic path mainline inclusion from mainline-v5.18-rc1 commit 966da62adaf1792c56fe909fc78f3833208ef5ae category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=966da62adaf1792c56fe909fc78f3833208ef5ae ---------------------------------------------------------------------- WARN if the new_spte being set by __tdp_mmu_set_spte() is a REMOVED_SPTE, which is called out by the comment as being disallowed but not actually checked. Keep the WARN on the old_spte as well, because overwriting a REMOVED_SPTE in the non-atomic path is also disallowed (as evidence by lack of splats with the existing WARN). Fixes: 08f07c800e9d ("KVM: x86/mmu: Flush TLBs after zap in TDP MMU PF handler") Cc: Ben Gardon Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Message-Id: <20220226001546.360188-12-seanjc@google.com> Reviewed-by: Mingwei Zhang Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 038db3f7f75f..f41137a118c8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -549,13 +549,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, lockdep_assert_held_write(&kvm->mmu_lock); /* - * No thread should be using this function to set SPTEs to the + * No thread should be using this function to set SPTEs to or from the * temporary removed SPTE value. * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic * should be used. If operating under the MMU lock in write mode, the * use of the removed SPTE should not be necessary. */ - WARN_ON(is_removed_spte(iter->old_spte)); + WARN_ON(is_removed_spte(iter->old_spte) || is_removed_spte(new_spte)); WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); -- Gitee From 562a3c10ed3ba5756b637cd67c4d3b6d1627ef32 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:48 -0800 Subject: [PATCH 077/158] KVM: x86/mmu: Use low available bits for removed SPTEs mainline inclusion from mainline-v5.13-rc1 commit 715f1079eee12f629b2de5c8a9489124a5af0a18 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=715f1079eee12f629b2de5c8a9489124a5af0a18 ---------------------------------------------------------------------- Use low "available" bits to tag REMOVED SPTEs. Using a high bit is moderately costly as it often causes the compiler to generate a 64-bit immediate. More importantly, this makes it very clear REMOVED_SPTE is a value, not a flag. 
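A minimal sketch of why a small constant suffices (the helper body is quoted
from the existing spte.h and is not touched by this patch): removed SPTEs are
matched by value equality rather than by testing a flag bit, so any
not-present encoding that avoids the RWX bits, the PFN bits and
SPTE_MMU_PRESENT_MASK works:

        #define REMOVED_SPTE    0x5a0ULL

        static inline bool is_removed_spte(u64 spte)
        {
                return spte == REMOVED_SPTE;
        }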
Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-24-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.c | 11 ++++++++++- arch/x86/kvm/mmu/spte.h | 11 +++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 29dc77f38c00..51e58543bb83 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -271,7 +271,16 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) mmio_value = 0; - WARN_ON((mmio_value & mmio_mask) != mmio_value); + /* + * The masked MMIO value must obviously match itself and a removed SPTE + * must not get a false positive. Removed SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and removed SPTEs must + * not set any RWX bits. + */ + if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || + WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + mmio_value = 0; + shadow_mmio_value = mmio_value; shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index efb9a1057cd4..6925dfc38981 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -170,13 +170,16 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * - * This constant works because it is considered non-present on both AMD and - * Intel CPUs and does not create a L1TF vulnerability because the pfn section - * is zeroed out. + * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on + * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF + * vulnerability. Use only low bits to avoid 64-bit immediates. * * Only used by the TDP MMU. */ -#define REMOVED_SPTE (1ull << 59) +#define REMOVED_SPTE 0x5a0ULL + +/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ +static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); static inline bool is_removed_spte(u64 spte) { -- Gitee From 024782cc61f1a11523d261dfda890fae5573d3d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Dec 2020 16:31:38 -0800 Subject: [PATCH 078/158] KVM: x86/mmu: Use raw level to index into MMIO walks' sptes array mainline inclusion from mainline-v5.11-rc3 commit dde81f9477d018a96fba991c5928c6ab8cc109f8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dde81f9477d018a96fba991c5928c6ab8cc109f8 ---------------------------------------------------------------------- Bump the size of the sptes array by one and use the raw level of the SPTE to index into the sptes array. Using the SPTE level directly improves readability by eliminating the need to reason out why the level is being adjusted when indexing the array. The array is on the stack and is not explicitly initialized; bumping its size is nothing more than a superficial adjustment to the stack frame. 
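Illustrative before/after of the indexing change, taken from the hunks below;
the extra slot simply leaves index 0 unused so the walker can store at the raw
SPTE level (levels run from 1 up to the root level):

        u64 sptes[PT64_ROOT_MAX_LEVEL + 1];     /* was PT64_ROOT_MAX_LEVEL */
        ...
        sptes[leaf] = spte;                     /* was sptes[leaf - 1] = spte */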
Signed-off-by: Sean Christopherson Message-Id: <20201218003139.2167891-4-seanjc@google.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 20 +++++++------------- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9ea7eb7a646a..205ac43ff049 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3544,7 +3544,7 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); - sptes[leaf - 1] = spte; + sptes[leaf] = spte; if (!is_shadow_present_pte(spte)) break; @@ -3558,7 +3558,7 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level /* return true if reserved bit is detected on spte. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { - u64 sptes[PT64_ROOT_MAX_LEVEL]; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; struct rsvd_bits_validate *rsvd_check; int root, leaf, level; bool reserved = false; @@ -3581,16 +3581,10 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) { - if (!is_shadow_present_pte(sptes[level - 1])) + if (!is_shadow_present_pte(sptes[level])) break; - /* - * Use a bitwise-OR instead of a logical-OR to aggregate the - * reserved bit and EPT's invalid memtype/XWR checks to avoid - * adding a Jcc in the loop. - */ - reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) || - __is_rsvd_bits_set(rsvd_check, sptes[level - 1], - level); + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) || + __is_rsvd_bits_set(rsvd_check, sptes[level], level); } if (reserved) { @@ -3598,10 +3592,10 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) __func__, addr); for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[level - 1], level); + sptes[level], level); } - *sptep = sptes[leaf - 1]; + *sptep = sptes[leaf]; return reserved; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index f41137a118c8..37924ced9757 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1452,7 +1452,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; - sptes[leaf - 1] = iter.old_spte; + sptes[leaf] = iter.old_spte; } rcu_read_unlock(); -- Gitee From 1f542fe86e132d4246e8a7a364d196049df47d06 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Dec 2020 16:31:39 -0800 Subject: [PATCH 079/158] KVM: x86/mmu: Optimize not-present/MMIO SPTE check in get_mmio_spte() mainline inclusion from mainline-v5.11-rc3 commit 9aa418792f5f11ef5d6f72265e1f8ae07efd5784 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9aa418792f5f11ef5d6f72265e1f8ae07efd5784 ---------------------------------------------------------------------- Check only the terminal leaf for a "!PRESENT || MMIO" SPTE when looking for reserved bits on valid, non-MMIO SPTEs. The get_walk() helpers terminate their walks if a not-present or MMIO SPTE is encountered, i.e. the non-terminal SPTEs have already been verified to be regular SPTEs. 
This eliminates an extra check-and-branch in a relatively hot loop. Signed-off-by: Sean Christopherson Message-Id: <20201218003139.2167891-5-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 205ac43ff049..8bcb45fce8ff 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3555,7 +3555,7 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level return leaf; } -/* return true if reserved bit is detected on spte. */ +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; @@ -3578,14 +3578,22 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) return reserved; } + *sptep = sptes[leaf]; + + /* + * Skip reserved bits checks on the terminal leaf if it's not a valid + * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by + * design, always have reserved bits set. The purpose of the checks is + * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. + */ + if (!is_shadow_present_pte(sptes[leaf])) + leaf++; + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; - for (level = root; level >= leaf; level--) { - if (!is_shadow_present_pte(sptes[level])) - break; + for (level = root; level >= leaf; level--) reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) || __is_rsvd_bits_set(rsvd_check, sptes[level], level); - } if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", @@ -3595,8 +3603,6 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) sptes[level], level); } - *sptep = sptes[leaf]; - return reserved; } -- Gitee From 556d433b18113aaaeef79571f6fde7a4ffcc9361 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Feb 2021 12:47:49 -0800 Subject: [PATCH 080/158] KVM: x86/mmu: Dump reserved bits if they're detected on non-MMIO SPTE mainline inclusion from mainline-v5.13-rc1 commit bb4cdf3af9395d50c731d86c15454105a31eb9e3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb4cdf3af9395d50c731d86c15454105a31eb9e3 ---------------------------------------------------------------------- Debugging unexpected reserved bit page faults sucks. Dump the reserved bits that (likely) caused the page fault to make debugging suck a little less. 
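One clarifying note on the new dump (illustrative; the hunk below is
authoritative): the extra value printed per level is the reserved-bit mask the
SPTE was checked against, with bit 7 (PS) of the SPTE selecting the huge-page
row of the table:

        rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level - 1]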
Signed-off-by: Sean Christopherson Message-Id: <20210225204749.1512652-25-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8bcb45fce8ff..15fe33b0e7ae 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3596,11 +3596,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) __is_rsvd_bits_set(rsvd_check, sptes[level], level); if (reserved) { - pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) - pr_err("------ spte 0x%llx level %d.\n", - sptes[level], level); + pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", + sptes[level], level, + rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]); } return reserved; -- Gitee From 56861351e432e450909ca6fd816bf6609f085988 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:04 -0800 Subject: [PATCH 081/158] KVM: x86/mmu: Split out max mapping level calculation to helper mainline inclusion from mainline-v5.12-rc1 commit 1b6d9d9ed5717157933db77d96bb12884c17ce52 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1b6d9d9ed5717157933db77d96bb12884c17ce52 ---------------------------------------------------------------------- Factor out the logic for determining the maximum mapping level given a memslot and a gpa. The helper will be used when zapping collapsible SPTEs when disabling dirty logging, e.g. to avoid zapping SPTEs that can't possibly be rebuilt as hugepages. No functional change intended. 
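For orientation, a sketch of the intended consumer when zapping collapsible
SPTEs (shape assumed here; the actual change lands in a later patch of this
series):

        if (sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
                                                       pfn, PG_LEVEL_NUM))
                pte_list_remove(rmap_head, sptep);  /* zap; rebuildable huge */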
Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 37 ++++++++++++++++++++------------- arch/x86/kvm/mmu/mmu_internal.h | 2 ++ 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 15fe33b0e7ae..4c41a9179a24 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2741,8 +2741,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, - kvm_pfn_t pfn, struct kvm_memory_slot *slot) +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + struct kvm_memory_slot *slot) { unsigned long hva; pte_t *pte; @@ -2761,19 +2761,36 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, */ hva = __gfn_to_hva_memslot(slot, gfn); - pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); + pte = lookup_address_in_mm(kvm->mm, hva, &level); if (unlikely(!pte)) return PG_LEVEL_4K; return level; } +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t pfn, int max_level) +{ + struct kvm_lpage_info *linfo; + + max_level = min(max_level, max_huge_page_level); + for ( ; max_level > PG_LEVEL_4K; max_level--) { + linfo = lpage_info_slot(gfn, slot, max_level); + if (!linfo->disallow_lpage) + break; + } + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + return host_pfn_mapping_level(kvm, gfn, pfn, slot); +} + int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; kvm_pfn_t pfn = *pfnp; kvm_pfn_t mask; int level; @@ -2790,17 +2807,7 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_huge_page_level); - for ( ; max_level > PG_LEVEL_4K; max_level--) { - linfo = lpage_info_slot(gfn, slot, max_level); - if (!linfo->disallow_lpage) - break; - } - - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); + level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); if (level == PG_LEVEL_4K) return level; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1d0ff532e9e0..5808127ca4fa 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -143,6 +143,8 @@ enum { #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) #define SET_SPTE_SPURIOUS BIT(2) +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t pfn, int max_level); int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level); -- Gitee From b05b7694a3f1e5c691f15eba8eb0a90c9895957a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:05 -0800 Subject: [PATCH 082/158] KVM: x86/mmu: Pass the memslot to the rmap callbacks mainline inclusion from mainline-v5.12-rc1 commit 0a234f5dd06582e82edec7cf17a0f971c5a4142e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0a234f5dd06582e82edec7cf17a0f971c5a4142e 
---------------------------------------------------------------------- Pass the memslot to the rmap callbacks, it will be used when zapping collapsible SPTEs to verify the memslot is compatible with hugepages before zapping its SPTEs. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4c41a9179a24..8391fab5a54b 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1151,7 +1151,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared. */ -static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1182,7 +1183,8 @@ static bool spte_set_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1246,7 +1248,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); - __rmap_clear_dirty(kvm, rmap_head); + __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ mask &= mask - 1; @@ -1302,7 +1304,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1322,7 +1325,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { - return kvm_zap_rmapp(kvm, rmap_head); + return kvm_zap_rmapp(kvm, rmap_head, slot); } static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -5306,7 +5309,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. 
*/ static __always_inline bool @@ -5320,7 +5324,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) - flush |= fn(kvm, iterator.rmap); + flush |= fn(kvm, iterator.rmap, memslot); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && lock_flush_tlb) { @@ -5616,7 +5620,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } static bool slot_rmap_write_protect(struct kvm *kvm, - struct kvm_rmap_head *rmap_head) + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } @@ -5650,7 +5655,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, - struct kvm_rmap_head *rmap_head) + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; -- Gitee From 9fc4f2aca094cc12aefe0bc9a599818b42644ae5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:06 -0800 Subject: [PATCH 083/158] KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs mainline inclusion from mainline-v5.12-rc1 commit 9eba50f8d7fcb61774f160890f98239fa3ab68a6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9eba50f8d7fcb61774f160890f98239fa3ab68a6 ---------------------------------------------------------------------- When zapping SPTEs in order to rebuild them as huge pages, use the new helper that computes the max mapping level to detect whether or not a SPTE should be zapped. Doing so avoids zapping SPTEs that can't possibly be rebuilt as huge pages, e.g. due to hardware constraints, memslot alignment, etc... This also avoids zapping SPTEs that are still large, e.g. if migration was canceled before write-protected huge pages were shattered to enable dirty logging. Note, such pages are still write-protected at this time, i.e. a page fault VM-Exit will still occur. This will hopefully be addressed in a future patch. Sadly, TDP MMU loses its const on the memslot, but that's a pervasive problem that's been around for quite some time. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-6-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 11 ++++++----- arch/x86/kvm/mmu/tdp_mmu.c | 13 +++++++------ arch/x86/kvm/mmu/tdp_mmu.h | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8391fab5a54b..c8f145fad009 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5677,8 +5677,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - (kvm_is_zone_device_pfn(pfn) || - PageCompound(pfn_to_page(pfn)))) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, + pfn, PG_LEVEL_NUM)) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -5698,12 +5698,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* FIXME: const-ify all uses of struct kvm_memory_slot. 
*/ + struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; + write_lock(&kvm->mmu_lock); - slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, - kvm_mmu_zap_collapsible_spte, true); + slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); + kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 37924ced9757..35a82e9c75ad 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1327,8 +1327,10 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) + struct kvm_memory_slot *slot) { + gfn_t start = slot->base_gfn; + gfn_t end = start + slot->npages; struct tdp_iter iter; kvm_pfn_t pfn; bool spte_set = false; @@ -1347,8 +1349,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm, pfn = spte_to_pfn(iter.old_spte); if (kvm_is_reserved_pfn(pfn) || - (!PageCompound(pfn_to_page(pfn)) && - !kvm_is_zone_device_pfn(pfn))) + iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, + pfn, PG_LEVEL_NUM)) continue; tdp_mmu_set_spte(kvm, &iter, 0); @@ -1366,7 +1368,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, * be replaced by large mappings, for GFNs within the slot. */ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) + struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; int root_as_id; @@ -1376,8 +1378,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, if (root_as_id != slot->as_id) continue; - zap_collapsible_spte_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); + zap_collapsible_spte_range(kvm, root, slot); } } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1e2b32f76f99..b10e155ffdfd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -51,7 +51,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, bool wrprot); bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); + struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); -- Gitee From 6ccb69356de523e6f346bb0d5cd17bea1f720b7e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:07 -0800 Subject: [PATCH 084/158] KVM: nVMX: Disable PML in hardware when running L2 mainline inclusion from mainline-v5.12-rc1 commit c3bb9a20834ffe72d3031afe460ff03d3b3b6e90 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c3bb9a20834ffe72d3031afe460ff03d3b3b6e90 ---------------------------------------------------------------------- Unconditionally disable PML in vmcs02, KVM emulates PML purely in the MMU, e.g. vmx_flush_pml_buffer() doesn't even try to copy the L2 GPAs from vmcs02's buffer to vmcs12. At best, enabling PML is a nop. At worst, it will cause vmx_flush_pml_buffer() to record bogus GFNs in the dirty logs. 
Initialize vmcs02.GUEST_PML_INDEX such that PML writes would trigger VM-Exit if PML was somehow enabled, skip flushing the buffer for guest mode since the index is bogus, and freak out if a PML full exit occurs when L2 is active. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-7-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++-------------- arch/x86/kvm/vmx/vmx.c | 12 ++++++++++-- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 50b8015726e2..26f4febd2e4c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2183,15 +2183,13 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); /* - * The PML address never changes, so it is constant in vmcs02. - * Conceptually we want to copy the PML index from vmcs01 here, - * and then back to vmcs01 on nested vmexit. But since we flush - * the log and reset GUEST_PML_INDEX on each vmexit, the PML - * index is also effectively constant in vmcs02. + * PML is emulated for L2, but never enabled in hardware as the MMU + * handles A/D emulation. Disabling PML for L2 also avoids having to + * deal with filtering out L2 GPAs from the buffer. */ if (enable_pml) { - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write64(PML_ADDRESS, 0); + vmcs_write16(GUEST_PML_INDEX, -1); } if (cpu_has_vmx_encls_vmexit()) @@ -2230,7 +2228,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, struct vmcs12 *vmcs12) { - u32 exec_control, vmcs12_exec_ctrl; + u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) @@ -2306,11 +2304,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_DESC); if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { - vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & - ~SECONDARY_EXEC_ENABLE_PML; - exec_control |= vmcs12_exec_ctrl; - } + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + exec_control |= vmcs12->secondary_vm_exec_control; + + /* PML is emulated and never enabled in hardware for L2. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* VMCS shadowing for L2 is emulated for now */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; @@ -5919,7 +5917,10 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_PREEMPTION_TIMER: return true; case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ + /* + * PML is emulated for an L1 VMM and should never be enabled in + * vmcs02, always "handle" PML_FULL by exiting to userspace. + */ return true; case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 591ea47a8f41..546be0ee1d4e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6359,9 +6359,10 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before * querying dirty_bitmap, we only need to kick all vcpus out of guest * mode as if vcpus is in root mode, the PML buffer must has been - * flushed already. + * flushed already. Note, PML is never enabled in hardware while + * running L2. 
*/ - if (enable_pml) + if (enable_pml && !is_guest_mode(vcpu)) vmx_flush_pml_buffer(vcpu); /* @@ -6377,6 +6378,13 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return handle_invalid_guest_state(vcpu); if (is_guest_mode(vcpu)) { + /* + * PML is never enabled when running L2, bail immediately if a + * PML full exit occurs as something is horribly wrong. + */ + if (exit_reason.basic == EXIT_REASON_PML_FULL) + goto unexpected_vmexit; + /* * The host physical addresses of some pages of guest memory * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC -- Gitee From b1ad18d1fc136611d730598d0e44323b0e226031 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:08 -0800 Subject: [PATCH 085/158] KVM: x86/mmu: Expand on the comment in kvm_vcpu_ad_need_write_protect() mainline inclusion from mainline-v5.12-rc1 commit 2855f98265dc579bd2becb79ce0156d08e0df813 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2855f98265dc579bd2becb79ce0156d08e0df813 ---------------------------------------------------------------------- Expand the comment about need to use write-protection for nested EPT when PML is enabled to clarify that the tagging is a nop when PML is _not_ enabled. Without the clarification, omitting the PML check looks wrong at first^Wfifth glance. Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-8-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu_internal.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5808127ca4fa..5d7ad5f25072 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -89,7 +89,10 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) * When using the EPT page-modification log, the GPAs in the log * would come from L2 rather than L1. Therefore, we need to rely * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. + * PML, since writes now result in a vmexit. Note, this helper will + * tag SPTEs as needing write-protection even if PML is disabled or + * unsupported, but that's ok because the tag is consumed if and only + * if PML is enabled. Omit the PML check to save a few uops. */ return vcpu->arch.mmu == &vcpu->arch.guest_mmu; } -- Gitee From 23d639e93071b6b832ac22971e1986ca5906afb2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Feb 2021 16:50:11 -0800 Subject: [PATCH 086/158] KVM: x86: Further clarify the logic and comments for toggling log dirty mainline inclusion from mainline-v5.12-rc1 commit 52f4607940b18337f01d160aaae346eaac388bf7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=52f4607940b18337f01d160aaae346eaac388bf7 ---------------------------------------------------------------------- Add a sanity check in kvm_mmu_slot_apply_flags to assert that the LOG_DIRTY_PAGES flag is indeed being toggled, and explicitly rely on that holding true when zapping collapsible SPTEs. Manipulating the CPU dirty log (PML) and write-protection also relies on this assertion, but that's not obvious in the current code. 
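The assertion itself reduces to an XOR test on the old and new flag words: if the dirty-logging bit reads the same on both sides, nothing is being toggled and the update is unexpected. A standalone illustration of that check (the flag value below is a stand-in, not the real KVM_MEM_LOG_DIRTY_PAGES definition):

  #include <assert.h>
  #include <stdbool.h>

  #define LOG_DIRTY_PAGES (1u << 0)   /* illustrative flag bit */

  /* True only when the dirty-logging flag actually changes state. */
  static bool dirty_logging_toggled(unsigned int old_flags, unsigned int new_flags)
  {
          return ((old_flags ^ new_flags) & LOG_DIRTY_PAGES) != 0;
  }

  int main(void)
  {
          assert(dirty_logging_toggled(0, LOG_DIRTY_PAGES));                 /* enable  */
          assert(dirty_logging_toggled(LOG_DIRTY_PAGES, 0));                 /* disable */
          assert(!dirty_logging_toggled(LOG_DIRTY_PAGES, LOG_DIRTY_PAGES));  /* no-op   */
          return 0;
  }
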
Signed-off-by: Sean Christopherson Message-Id: <20210213005015.1651772-11-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/x86.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe7e07ec04f4..bc2d26f67bf2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11106,12 +11106,20 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, enum kvm_mr_change change) { /* - * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot. - * See comments below. + * Nothing to do for RO slots (which can't be dirtied and can't be made + * writable) or CREATE/MOVE/DELETE of a slot. See comments below. */ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) return; + /* + * READONLY and non-flags changes were filtered out above, and the only + * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty + * logging isn't being toggled on or off. + */ + if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) + return; + /* * Dirty logging tracks sptes in 4k granularity, meaning that large * sptes have to be split. If live migration is successful, the guest @@ -11129,8 +11137,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * MOVE/DELETE: The old mappings will already have been cleaned up by * kvm_arch_flush_shadow_memslot() */ - if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); /* -- Gitee From e5e2fe8343ccfc9139a71298345a9ec160288614 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 2 Apr 2021 08:05:19 -0400 Subject: [PATCH 087/158] KVM: MMU: protect TDP MMU pages only down to required level mainline inclusion from mainline-v5.13-rc1 commit dbb6964e4c38509936719223530acb1870cd6e86 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dbb6964e4c38509936719223530acb1870cd6e86 ---------------------------------------------------------------------- When using manual protection of dirty pages, it is not necessary to protect nested page tables down to the 4K level; instead KVM can protect only hugepages in order to split them lazily, and delay write protection at 4K-granularity until KVM_CLEAR_DIRTY_LOG. This was overlooked in the TDP MMU, so do it there as well. 
Fixes: a6a0b05da9f37 ("kvm: x86/mmu: Support dirty logging for the TDP MMU") Cc: Ben Gardon Reviewed-by: Keqian Zhu Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8f145fad009..bd1ed4987aff 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5636,7 +5636,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); write_unlock(&kvm->mmu_lock); /* -- Gitee From 5ef383b34f676e2a4f5256d0c6b1a1199d2d02ba Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 8 Apr 2021 08:10:25 -0400 Subject: [PATCH 088/158] KVM: MMU: load PDPTRs outside mmu_lock mainline inclusion from mainline-v5.13-rc1 commit 4a38162ee9f10f5f67c36f4f5aa4f6be2657efd5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4a38162ee9f10f5f67c36f4f5aa4f6be2657efd5 ---------------------------------------------------------------------- On SVM, reading PDPTRs might access guest memory, which might fault and thus might sleep. On the other hand, it is not possible to release the lock after make_mmu_pages_available has been called. Therefore, push the call to make_mmu_pages_available and the mmu_lock critical section within mmu_alloc_direct_roots and mmu_alloc_shadow_roots. Reported-by: Wanpeng Li Co-developed-by: Sean Christopherson Signed-off-by: Paolo Bonzini conflicts: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 52 ++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bd1ed4987aff..bd5e992ee506 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3253,6 +3253,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) u8 shadow_root_level = mmu->shadow_root_level; hpa_t root; unsigned i; + int r; + + write_lock(&vcpu->kvm->mmu_lock); + r = make_mmu_pages_available(vcpu); + if (r < 0) + goto out_unlock; if (is_tdp_mmu_enabled(vcpu->kvm)) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); @@ -3261,8 +3267,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); mmu->root_hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { - if (WARN_ON_ONCE(!mmu->pae_root)) - return -EIO; + if (WARN_ON_ONCE(!mmu->pae_root)) { + r = -EIO; + goto out_unlock; + } for (i = 0; i < 4; ++i) { WARN_ON_ONCE(mmu->pae_root[i] && @@ -3276,13 +3284,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) mmu->root_hpa = __pa(mmu->pae_root); } else { WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); - return -EIO; + r = -EIO; + goto out_unlock; } /* root_pgd is ignored for direct MMUs. 
*/ mmu->root_pgd = 0; - - return 0; +out_unlock: + write_unlock(&vcpu->kvm->mmu_lock); + return r; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) @@ -3291,7 +3301,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; hpa_t root; - int i; + unsigned i; + int r; root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; @@ -3299,6 +3310,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu_check_root(vcpu, root_gfn)) return 1; + /* + * On SVM, reading PDPTRs might access guest memory, which might fault + * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. + */ if (mmu->root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { pdptrs[i] = mmu->get_pdptr(vcpu, i); @@ -3310,6 +3325,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } } + write_lock(&vcpu->kvm->mmu_lock); + r = make_mmu_pages_available(vcpu); + if (r < 0) + goto out_unlock; + /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. @@ -3321,8 +3341,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) goto set_root_pgd; } - if (WARN_ON_ONCE(!mmu->pae_root)) - return -EIO; + if (WARN_ON_ONCE(!mmu->pae_root)) { + r = -EIO; + goto out_unlock; + } /* * We shadow a 32 bit page table. This may be a legacy 2-level @@ -3333,8 +3355,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - if (WARN_ON_ONCE(!mmu->pml4_root)) - return -EIO; + if (WARN_ON_ONCE(!mmu->pml4_root)) { + r = -EIO; + goto out_unlock; + } mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; @@ -3367,6 +3391,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) set_root_pgd: mmu->root_pgd = root_pgd; +out_unlock: + write_unlock(&vcpu->kvm->mmu_lock); return 0; } @@ -4919,14 +4945,10 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_alloc_special_roots(vcpu); if (r) goto out; - write_lock(&vcpu->kvm->mmu_lock); - if (make_mmu_pages_available(vcpu)) - r = -ENOSPC; - else if (vcpu->arch.mmu->direct_map) + if (vcpu->arch.mmu->direct_map) r = mmu_alloc_direct_roots(vcpu); else r = mmu_alloc_shadow_roots(vcpu); - write_unlock(&vcpu->kvm->mmu_lock); if (r) goto out; -- Gitee From f01d09093b0bed58ab3578c2bfb2c956ad1795ea Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 1 Mar 2022 20:49:41 +0800 Subject: [PATCH 089/158] KVM: x86/mmu: Passing up the error state of mmu_alloc_shadow_roots() mainline inclusion from mainline-v5.17-rc7 commit c6c937d673aaa1d603f62f134e1ca9c173eeeed3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c6c937d673aaa1d603f62f134e1ca9c173eeeed3 ---------------------------------------------------------------------- Just like on the optional mmu_alloc_direct_roots() path, once shadow path reaches "r = -EIO" somewhere, the caller needs to know the actual state in order to enter error handling and avoid something worse. 
Fixes: 4a38162ee9f1 ("KVM: MMU: load PDPTRs outside mmu_lock") Signed-off-by: Like Xu Reviewed-by: Sean Christopherson Message-Id: <20220301124941.48412-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bd5e992ee506..a9396ff15050 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3394,7 +3394,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - return 0; + return r; } static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) -- Gitee From 4b6ef1eac953df466d1d4bcf866bb5b0ebc8bedb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:40 -0700 Subject: [PATCH 090/158] KVM: x86/mmu: Coalesce TDP MMU TLB flushes when zapping collapsible SPTEs mainline inclusion from mainline-v5.13-rc1 commit af95b53e56e34a4df343cec32b3a3276d9d06ad3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=af95b53e56e34a4df343cec32b3a3276d9d06ad3 ---------------------------------------------------------------------- When zapping collapsible SPTEs across multiple roots, gather pending flushes and perform a single remote TLB flush at the end, as opposed to flushing after processing every root. Note, flush may be cleared by the result of zap_collapsible_spte_range(). This is intended and correct, e.g. yielding may have serviced a prior pending flush. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 35a82e9c75ad..89312e748f0e 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1325,21 +1325,21 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) * Clear leaf entries which could be replaced by large mappings, for * GFNs within the slot. 
*/ -static void zap_collapsible_spte_range(struct kvm *kvm, +static bool zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - struct kvm_memory_slot *slot) + struct kvm_memory_slot *slot, + bool flush) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; kvm_pfn_t pfn; - bool spte_set = false; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { - spte_set = false; + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + flush = false; continue; } @@ -1355,12 +1355,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm, tdp_mmu_set_spte(kvm, &iter, 0); - spte_set = true; + flush = true; } rcu_read_unlock(); - if (spte_set) - kvm_flush_remote_tlbs(kvm); + + return flush; } /* @@ -1371,6 +1371,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; + bool flush = false; int root_as_id; for_each_tdp_mmu_root_yield_safe(kvm, root) { @@ -1378,8 +1379,11 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, if (root_as_id != slot->as_id) continue; - zap_collapsible_spte_range(kvm, root, slot); + flush = zap_collapsible_spte_range(kvm, root, slot, flush); } + + if (flush) + kvm_flush_remote_tlbs(kvm); } /* -- Gitee From 37361e1b1129041f58f560600508d37f95dde3b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:41 -0700 Subject: [PATCH 091/158] KVM: x86/mmu: Move flushing for "slot" handlers to caller for legacy MMU mainline inclusion from mainline-v5.13-rc1 commit 302695a5747e82267c344d177cdd4866cbccee8e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=302695a5747e82267c344d177cdd4866cbccee8e ---------------------------------------------------------------------- Place the onus on the caller of slot_handle_*() to flush the TLB, rather than handling the flush in the helper, and rename parameters accordingly. This will allow future patches to coalesce flushes between address spaces and between the legacy and TDP MMUs. No functional change intended. 
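The refactor follows the familiar "report the work, let the caller act" shape: each helper only accumulates whether a TLB flush has become necessary, and the outermost caller issues a single flush for the whole batch. A compact userspace model of that pattern, with purely illustrative names:

  #include <stdbool.h>
  #include <stdio.h>

  /* Stand-in for a per-range zap; true means TLB entries went stale. */
  static bool zap_range(int range)
  {
          return (range % 2) == 0;   /* pretend even ranges had live SPTEs */
  }

  static void flush_remote_tlbs(void)
  {
          puts("single remote TLB flush for the whole batch");
  }

  int main(void)
  {
          bool flush = false;
          int range;

          /* Helpers only report the flush requirement... */
          for (range = 0; range < 8; range++)
                  flush |= zap_range(range);

          /* ...and the caller flushes once at the end, if anything changed. */
          if (flush)
                  flush_remote_tlbs();
          return 0;
  }
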
Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a9396ff15050..59dbf41d0849 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5338,7 +5338,7 @@ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_ static __always_inline bool slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) + gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield) { struct slot_rmap_walk_iterator iterator; bool flush = false; @@ -5349,7 +5349,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, flush |= fn(kvm, iterator.rmap, memslot); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush && lock_flush_tlb) { + if (flush && flush_on_yield) { kvm_flush_remote_tlbs_with_address(kvm, start_gfn, iterator.gfn - start_gfn + 1); @@ -5359,24 +5359,18 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } } - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs_with_address(kvm, start_gfn, - end_gfn - start_gfn + 1); - flush = false; - } - return flush; } static __always_inline bool slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, - bool lock_flush_tlb) + bool flush_on_yield) { return slot_handle_level_range(kvm, memslot, fn, start_level, end_level, memslot->base_gfn, memslot->base_gfn + memslot->npages - 1, - lock_flush_tlb); + flush_on_yield); } static __always_inline bool @@ -5397,10 +5391,10 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) + slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, - PG_LEVEL_4K, lock_flush_tlb); + PG_LEVEL_4K, flush_on_yield); } static void free_mmu_pages(struct kvm_mmu *mmu) @@ -5625,10 +5619,14 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true); + flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); + + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end); } } @@ -5721,9 +5719,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; + bool flush; write_lock(&kvm->mmu_lock); - slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); @@ -5735,7 +5736,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, { /* * All current use cases for flushing the TLBs for a specific memslot - * are related to dirty logging, and do the TLB flush out of mmu_lock. 
+ * related to dirty logging, and many do the TLB flush out of mmu_lock. * The interaction between the various operations on memslot must be * serialized by slots_locks to ensure the TLB flush from one operation * is observed by any other operation on the same memslot. -- Gitee From 1b8342101e7d78cbd333e793c589e7f951f15e0b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:42 -0700 Subject: [PATCH 092/158] KVM: x86/mmu: Coalesce TLB flushes when zapping collapsible SPTEs mainline inclusion from mainline-v5.13-rc1 commit 142ccde1f7b1b0c621c299cbcc8feb6353f7cc92 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=142ccde1f7b1b0c621c299cbcc8feb6353f7cc92 ---------------------------------------------------------------------- Gather pending TLB flushes across both the legacy and TDP MMUs when zapping collapsible SPTEs to avoid multiple flushes if both the legacy MMU (for nested guests) and TDP MMU have mappings for the memslot. Note, this also optimizes the TDP MMU to flush only the relevant range when running as L1 with Hyper-V enlightenments. Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-4-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/tdp_mmu.h Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 6 ++++-- arch/x86/kvm/mmu/tdp_mmu.c | 8 +++----- arch/x86/kvm/mmu/tdp_mmu.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 59dbf41d0849..e5d9c5ac8c3b 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5723,11 +5723,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + + if (is_tdp_mmu_enabled(kvm)) + flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); + if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 89312e748f0e..fc85cd41ed66 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1367,11 +1367,10 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, * Clear non-leaf entries (and free associated page tables) which could * be replaced by large mappings, for GFNs within the slot. 
*/ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + struct kvm_memory_slot *slot, bool flush) { struct kvm_mmu_page *root; - bool flush = false; int root_as_id; for_each_tdp_mmu_root_yield_safe(kvm, root) { @@ -1382,8 +1381,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, flush = zap_collapsible_spte_range(kvm, root, slot, flush); } - if (flush) - kvm_flush_remote_tlbs(kvm); + return flush; } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index b10e155ffdfd..8e35cedb9812 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -50,8 +50,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, gfn_t gfn, unsigned long mask, bool wrprot); bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *slot); +bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + struct kvm_memory_slot *slot, bool flush); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); -- Gitee From ded1d73faa439e5303d7306432d2cedcb480efc6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:43 -0700 Subject: [PATCH 093/158] KVM: x86/mmu: Coalesce TLB flushes across address spaces for gfn range zap mainline inclusion from mainline-v5.13-rc1 commit 1a61b7db7a8dc44ce5010926ed48b519dda92d84 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1a61b7db7a8dc44ce5010926ed48b519dda92d84 ---------------------------------------------------------------------- Gather pending TLB flushes across both address spaces when zapping a given gfn range. This requires feeding "flush" back into subsequent calls, but on the plus side sets the stage for further batching between the legacy MMU and TDP MMU. It also allows refactoring the address space iteration to cover the legacy and TDP MMUs without introducing truly ugly code. 
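Because the flush state is now both an input and an output of the helper, a yield taken while processing one address space can service, and therefore clear, a flush still owed from the previous one. A minimal sketch of that in/out threading, with made-up names and stubbed-out behaviour:

  #include <stdbool.h>
  #include <stdio.h>

  /* Each call consumes the pending-flush state and returns an updated one. */
  static bool zap_one_address_space(int as_id, bool flush)
  {
          if (flush) {
                  /* a yield point may flush on the caller's behalf... */
                  printf("flush serviced while yielding in as_id %d\n", as_id);
                  flush = false;
          }
          /* ...and zapping a live mapping makes a new flush necessary */
          if (as_id == 0)
                  flush = true;
          return flush;
  }

  int main(void)
  {
          bool flush = false;
          int as_id;

          for (as_id = 0; as_id < 2; as_id++)
                  flush = zap_one_address_space(as_id, flush);

          if (flush)
                  puts("final remote TLB flush covers both address spaces");
          return 0;
  }
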
Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-5-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/tdp_mmu.h Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e5d9c5ac8c3b..41dc0ac058fe 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5338,10 +5338,10 @@ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_ static __always_inline bool slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield) + gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, + bool flush) { struct slot_rmap_walk_iterator iterator; - bool flush = false; for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, end_gfn, &iterator) { @@ -5370,7 +5370,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, return slot_handle_level_range(kvm, memslot, fn, start_level, end_level, memslot->base_gfn, memslot->base_gfn + memslot->npages - 1, - flush_on_yield); + flush_on_yield, false); } static __always_inline bool @@ -5606,7 +5606,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; - bool flush; + bool flush = false; write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -5622,14 +5622,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true); - - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end); + start, end - 1, true, flush); } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + if (is_tdp_mmu_enabled(kvm)) { flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); if (flush) -- Gitee From 3bba06de8ae9265472b438b1fcc63c2f7b37112e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:44 -0700 Subject: [PATCH 094/158] KVM: x86/mmu: Pass address space ID to __kvm_tdp_mmu_zap_gfn_range() mainline inclusion from mainline-v5.13-rc1 commit 2b9663d8a19d0a3efd29fd4f5f3e2c4ea88982c7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2b9663d8a19d0a3efd29fd4f5f3e2c4ea88982c7 ---------------------------------------------------------------------- Pass the address space ID to TDP MMU's primary "zap gfn range" helper to allow the MMU notifier paths to iterate over memslots exactly once. Currently, both the legacy MMU and TDP MMU iterate over memslots when looking for an overlapping hva range, which can be quite costly if there are a large number of memslots. Add a "flush" parameter so that iterating over multiple address spaces in the caller will continue to do the right thing when yielding while a flush is pending from a previous address space. Note, this also has a functional change in the form of coalescing TLB flushes across multiple address spaces in kvm_zap_gfn_range(), and also optimizes the TDP MMU to utilize range-based flushing when running as L1 with Hyper-V enlightenments. 
Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-6-seanjc@google.com> [Keep separate for loops to prepare for other incoming patches. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 12 ++++++------ arch/x86/kvm/mmu/tdp_mmu.c | 17 +++++++++++------ arch/x86/kvm/mmu/tdp_mmu.h | 14 ++++++++------ 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 41dc0ac058fe..32b5458ea43e 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5626,15 +5626,15 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - if (is_tdp_mmu_enabled(kvm)) { - flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); - if (flush) - kvm_flush_remote_tlbs(kvm); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, + gfn_end, flush); } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index fc85cd41ed66..e50ea9283214 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -699,14 +699,16 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. */ -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, - bool can_yield) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, + gfn_t end, bool can_yield, bool flush) { struct kvm_mmu_page *root; - bool flush = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) + for_each_tdp_mmu_root_yield_safe(kvm, root) { + if (kvm_mmu_page_as_id(root) != as_id) + continue; flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); + } return flush; } @@ -714,9 +716,12 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - bool flush; + bool flush = false; + int i; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush); - flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); if (flush) kvm_flush_remote_tlbs(kvm); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 8e35cedb9812..ac07443b530e 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -8,12 +8,12 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, - bool can_yield); -static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, - gfn_t end) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, + gfn_t end, bool can_yield, bool flush); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, + gfn_t start, gfn_t end, bool flush) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -23,7 +23,9 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) * Don't allow yielding, as the caller may have pending pages to zap * on the shadow 
MMU. */ - return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); + lockdep_assert_held_write(&kvm->mmu_lock); + return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), + sp->gfn, end, false, false); } void kvm_tdp_mmu_zap_all(struct kvm *kvm); -- Gitee From dcdee4d11b3d6082a262d356393906cefe6eb645 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:45 -0700 Subject: [PATCH 095/158] KVM: x86/mmu: Pass address space ID to TDP MMU root walkers mainline inclusion from mainline-v5.13-rc1 commit a3f15bda46e85c33e55b23aa51dd542453f134e3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a3f15bda46e85c33e55b23aa51dd542453f134e3 ---------------------------------------------------------------------- Move the address space ID check that is performed when iterating over roots into the macro helpers to consolidate code. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-7-seanjc@google.com> Signed-off-by: Paolo Bonzini conflicts: arch/x86/kvm/mmu/tdp_mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu_internal.h | 7 +- arch/x86/kvm/mmu/tdp_mmu.c | 111 +++++++++++--------------------- 2 files changed, 45 insertions(+), 73 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5d7ad5f25072..1685c34693bb 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -78,9 +78,14 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) +{ + return role.smm ? 1 : 0; +} + static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) { - return sp->role.smm ? 1 : 0; + return kvm_mmu_role_as_id(sp->role); } static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e50ea9283214..c8d66483a1ef 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -76,14 +76,18 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * if exiting the loop early, the caller must drop the reference to the most * recent root. (Unless keeping a live reference is desirable.) */ -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \ typeof(*_root), link); \ tdp_mmu_next_root_valid(_kvm, _root); \ - _root = tdp_mmu_next_root(_kvm, _root)) + _root = tdp_mmu_next_root(_kvm, _root)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ + } else -#define for_each_tdp_mmu_root(_kvm, _root) \ - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) +#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ + } else static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield, bool flush); @@ -148,7 +152,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); /* Check for an existing root before allocating a new one. 
*/ - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { if (root->role.word == role.word) { kvm_mmu_get_root(kvm, root); goto out; @@ -704,11 +708,8 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - if (kvm_mmu_page_as_id(root) != as_id) - continue; + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); - } return flush; } @@ -888,27 +889,28 @@ static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, int ret = 0; int as_id; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - as_id = kvm_mmu_page_as_id(root); - slots = __kvm_memslots(kvm, as_id); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - ret |= handler(kvm, memslot, root, gfn_start, - gfn_end, data); + for (as_id = 0; as_id < KVM_ADDRESS_SPACE_NUM; as_id++) { + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) { + slots = __kvm_memslots(kvm, as_id); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. 
+ */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, memslot, root, gfn_start, + gfn_end, data); + } } } @@ -1120,17 +1122,11 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); - } return spte_set; } @@ -1188,17 +1184,11 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - } return spte_set; } @@ -1260,16 +1250,10 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, bool wrprot) { struct kvm_mmu_page *root; - int root_as_id; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - + for_each_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); - } } /* @@ -1312,17 +1296,11 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - } return spte_set; } @@ -1376,15 +1354,9 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_memory_slot *slot, bool flush) { struct kvm_mmu_page *root; - int root_as_id; - - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) flush = zap_collapsible_spte_range(kvm, root, slot, flush); - } return flush; } @@ -1428,17 +1400,12 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - + for_each_tdp_mmu_root(kvm, root, slot->as_id) spte_set |= write_protect_gfn(kvm, root, gfn); - } + return spte_set; } -- Gitee From 42277895e56782fcbc35b385764976f908e69eae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:46 -0700 Subject: [PATCH 096/158] KVM: x86/mmu: Use leaf-only loop for walking TDP SPTEs when changing SPTE mainline inclusion from mainline-v5.13-rc1 commit aaaac889cf63a6c2e5f4b20c20cccf5aeb78bd51 category: feature bugzilla: 
https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aaaac889cf63a6c2e5f4b20c20cccf5aeb78bd51 ---------------------------------------------------------------------- Use the leaf-only TDP iterator when changing the SPTE in reaction to a MMU notifier. Practically speaking, this is a nop since the guts of the loop explicitly looks for 4k SPTEs, which are always leaf SPTEs. Switch the iterator to match age_gfn_range() and test_age_gfn() so that a future patch can consolidate the core iterating logic. No real functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-8-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c8d66483a1ef..a3a260167f56 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1037,7 +1037,7 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, new_pfn = pte_pfn(*ptep); - tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { if (iter.level != PG_LEVEL_4K) continue; -- Gitee From c736c66e139b25f90d40f6f6b381f71ab4595a7c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:47 -0700 Subject: [PATCH 097/158] KVM: Move prototypes for MMU notifier callbacks to generic code mainline inclusion from mainline-v5.13-rc1 commit 5f7c292b8975c9146063abbb91c0b9cdc1a5e9c5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f7c292b8975c9146063abbb91c0b9cdc1a5e9c5 Also, use the common declaration of kvm_unmap_hva_range() for loongarch, instead of a private one. And change the definition for it. Previously, loongarch's version of kvm_unmap_hva_range() defined "bool blockable" as its last parameter(which is not used). But it is wrong. KVM MMU uses "flag" of struct mmu_notifier_range as the argument when it calls kvm_unmap_hva_range(), the arch code should use if (flag & MMU_NOTIFIER_RANGE_BLOCKABLE) inside kvm_unmap_hva_range() when it wanna know if current operation is a blockable one. So just fix the broken syntax for loongarch. ---------------------------------------------------------------------- Move the prototypes for the MMU notifier callbacks out of arch code and into common code. There is no benefit to having each arch replicate the prototypes since any deviation from the invocation in common code will explode. No functional change intended. 
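For the loongarch change specifically, the handler now receives the notifier range flags rather than a dedicated boolean, and checks them as sketched below. The flag's numeric value here is only a placeholder so the snippet is self-contained; the real MMU_NOTIFIER_RANGE_BLOCKABLE definition lives in include/linux/mmu_notifier.h:

  #include <stdbool.h>

  #define MMU_NOTIFIER_RANGE_BLOCKABLE (1u << 0)  /* placeholder value */

  /* What an arch handler tests instead of taking a "bool blockable" arg. */
  static bool unmap_range_may_block(unsigned int flags)
  {
          return (flags & MMU_NOTIFIER_RANGE_BLOCKABLE) != 0;
  }

  int main(void)
  {
          return unmap_range_may_block(MMU_NOTIFIER_RANGE_BLOCKABLE) ? 0 : 1;
  }
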
Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-9-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: include/trace/events/kvm.h Signed-off-by: Yu Zhang --- arch/arm64/include/asm/kvm_host.h | 5 ----- arch/loongarch/include/asm/kvm_host.h | 5 ----- arch/loongarch/kvm/mmu.c | 2 +- arch/mips/include/asm/kvm_host.h | 5 ----- arch/powerpc/include/asm/kvm_host.h | 7 ------- arch/x86/include/asm/kvm_host.h | 6 +----- include/linux/kvm_host.h | 8 ++++++++ 7 files changed, 10 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index dc445f074c77..a84343e48b76 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -525,11 +525,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index e036ca0dbe51..a71b5813d64f 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -298,11 +298,6 @@ enum _kvm_fault_result { }; #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, bool blockable); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); static inline void update_pc(struct kvm_vcpu_arch *arch) { diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 4e4c1ddabb5d..cdd16862fb4c 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -712,7 +712,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, return npages > 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable) +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { unsigned long npages; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 3a5612e7304c..feaa77036b67 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -967,11 +967,6 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, bool write); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2b9b6855ec86..4c0d787c2436 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -56,13 +56,6 @@ #define KVM_ARCH_WANT_MMU_NOTIFIER -extern int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, - unsigned flags); -extern int 
kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); - #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33df6a08f7d7..21bb5738e2cc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1726,11 +1726,7 @@ asmlinkage void kvm_spurious_fault(void); _ASM_EXTABLE(666b, 667b) #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); + int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cf1f65c8c0bf..5e644f9611c2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -218,6 +218,14 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end, unsigned flags); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#endif + enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, -- Gitee From b9f007661d75400c6ea82bc70faff461dc751c45 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:48 -0700 Subject: [PATCH 098/158] KVM: Move arm64's MMU notifier trace events to generic code mainline inclusion from mainline-v5.13-rc1 commit 501b918525efec2e701e806f04d474d7da350962 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=501b918525efec2e701e806f04d474d7da350962 ---------------------------------------------------------------------- Move arm64's MMU notifier trace events into common code in preparation for doing the hva->gfn lookup in common code. The alternative would be to trace the gfn instead of hva, but that's not obviously better and could also be done in common code. Tracing the notifiers is also quite handy for debug regardless of architecture. Remove a completely redundant tracepoint from PPC e500. 
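For illustration only (not from the patch): the benefit of hoisting these trace events is that they sit on the single generic notifier entry path instead of being duplicated per architecture. The self-contained toy below models that layering, with printf standing in for the real tracepoints; all names are invented for the example.

/* trace_at_common_entry.c - standalone model, not kernel code */
#include <stdio.h>

/* Per-"architecture" work, supplied as a callback in this toy model. */
typedef int (*arch_unmap_fn)(unsigned long start, unsigned long end);

static int toy_arch_unmap(unsigned long start, unsigned long end)
{
        return (int)((end - start) >> 12);      /* pretend: pages zapped */
}

/* Generic notifier entry: the trace fires exactly once, here, so every
 * arch handler underneath is covered without keeping its own copy. */
static int generic_unmap_hva_range(arch_unmap_fn arch_unmap,
                                   unsigned long start, unsigned long end)
{
        printf("mmu notifier unmap range: %#016lx -- %#016lx\n", start, end);
        return arch_unmap(start, end);
}

int main(void)
{
        generic_unmap_hva_range(toy_arch_unmap, 0x7f0000000000UL,
                                0x7f0000004000UL);
        return 0;
}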
Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-10-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/arm64/kvm/mmu.c | 7 +--- arch/arm64/kvm/trace_arm.h | 66 -------------------------------- arch/powerpc/kvm/e500_mmu_host.c | 2 - arch/powerpc/kvm/trace_booke.h | 15 -------- include/trace/events/kvm.h | 66 ++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 10 +++++ 6 files changed, 78 insertions(+), 88 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 23a51f0bab75..2035f4876cab 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1153,7 +1153,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_unmap_hva_range(start, end); handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags); return 0; } @@ -1183,8 +1182,6 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_set_spte_hva(hva); - /* * We've moved a page around, probably through CoW, so let's treat it * just like a translation fault and clean the cache to the PoC. @@ -1215,7 +1212,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_age_hva(start, end); + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); } @@ -1223,7 +1220,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_test_age_hva(hva); + return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, kvm_test_age_hva_handler, NULL); } diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index d5edb4cb217b..07960c0ad3e9 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -136,72 +136,6 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); -TRACE_EVENT(kvm_unmap_hva_range, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_set_spte_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) -); - -TRACE_EVENT(kvm_age_hva, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier age hva: %#016lx -- %#016lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_test_age_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) -); - TRACE_EVENT(kvm_set_way_flush, TP_PROTO(unsigned long vcpu_pc, bool cache), TP_ARGS(vcpu_pc, cache), diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index ed0c9c43d0cf..648aefe1a3e7 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -723,8 +723,6 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, static int kvm_unmap_hva(struct kvm *kvm, 
unsigned long hva) { - trace_kvm_unmap_hva(hva); - /* * Flush all shadow tlb entries everywhere. This is slow, but * we are 100% sure that we catch the to be unmapped page diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h index 3837842986aa..eff6e82dbcd4 100644 --- a/arch/powerpc/kvm/trace_booke.h +++ b/arch/powerpc/kvm/trace_booke.h @@ -69,21 +69,6 @@ TRACE_EVENT(kvm_exit, ) ); -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("unmap hva 0x%lx\n", __entry->hva) -); - TRACE_EVENT(kvm_booke206_stlb_write, TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 26cfb0fa8e7e..98ddfd49f421 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -399,6 +399,72 @@ TRACE_EVENT(kvm_halt_poll_ns, #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(false, vcpu_id, new, old) +TRACE_EVENT(kvm_unmap_hva_range, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_set_spte_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) +); + +TRACE_EVENT(kvm_age_hva, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier age hva: %#016lx -- %#016lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_test_age_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3c34d4b0a961..9dc85c23316d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -517,6 +517,8 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct kvm *kvm = mmu_notifier_to_kvm(mn); int idx; + trace_kvm_set_spte_hva(address); + idx = srcu_read_lock(&kvm->srcu); KVM_MMU_LOCK(kvm); @@ -536,6 +538,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; + trace_kvm_unmap_hva_range(range->start, range->end); + idx = srcu_read_lock(&kvm->srcu); KVM_MMU_LOCK(kvm); /* @@ -607,6 +611,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; + trace_kvm_age_hva(start, end); + idx = srcu_read_lock(&kvm->srcu); KVM_MMU_LOCK(kvm); @@ -628,6 +634,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; + trace_kvm_age_hva(start, end); + idx = 
srcu_read_lock(&kvm->srcu); KVM_MMU_LOCK(kvm); /* @@ -657,6 +665,8 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; + trace_kvm_test_age_hva(address); + idx = srcu_read_lock(&kvm->srcu); KVM_MMU_LOCK(kvm); young = kvm_test_age_hva(kvm, address); -- Gitee From 36d501512ddbb77aa14c074cab82be3ae777b1b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:57 -0700 Subject: [PATCH 099/158] KVM: x86/mmu: Drop trace_kvm_age_page() tracepoint mainline inclusion from mainline-v5.13-rc1 commit 6dfbd6b5d5de19bad36f44710359200f21191134 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6dfbd6b5d5de19bad36f44710359200f21191134 ---------------------------------------------------------------------- Remove x86's trace_kvm_age_page() tracepoint. It's mostly redundant with the common trace_kvm_age_hva() tracepoint, and if there is a need for the extra details, e.g. gfn, referenced, etc... those details should be added to the common tracepoint so that all architectures and MMUs benefit from the info. Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-19-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 1 - arch/x86/kvm/mmu/tdp_mmu.c | 2 -- include/trace/events/kvm.h | 24 ------------------------ 3 files changed, 27 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 32b5458ea43e..b81ccb6c0ceb 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1525,7 +1525,6 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, for_each_rmap_spte(rmap_head, &iter, sptep) young |= mmu_spte_age(sptep); - trace_kvm_age_page(gfn, level, slot, young); return young; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a3a260167f56..42c8076bbfba 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -981,8 +981,6 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); young = 1; - - trace_kvm_age_page(iter.gfn, iter.level, slot, young); } rcu_read_unlock(); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 98ddfd49f421..279d0b7af42b 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -255,30 +255,6 @@ TRACE_EVENT(kvm_fpu, TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) ); -TRACE_EVENT(kvm_age_page, - TP_PROTO(ulong gfn, int level, struct kvm_memory_slot *slot, int ref), - TP_ARGS(gfn, level, slot, ref), - - TP_STRUCT__entry( - __field( u64, hva ) - __field( u64, gfn ) - __field( u8, level ) - __field( u8, referenced ) - ), - - TP_fast_assign( - __entry->gfn = gfn; - __entry->level = level; - __entry->hva = ((gfn - slot->base_gfn) << - PAGE_SHIFT) + slot->userspace_addr; - __entry->referenced = ref; - ), - - TP_printk("hva %llx gfn %llx level %u %s", - __entry->hva, __entry->gfn, __entry->level, - __entry->referenced ? 
"YOUNG" : "OLD") -); - #ifdef CONFIG_KVM_ASYNC_PF DECLARE_EVENT_CLASS(kvm_async_get_page_class, -- Gitee From d99ec31469a1af089d2f5a331f936d317a745443 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Mar 2021 17:49:41 -0700 Subject: [PATCH 100/158] KVM: x86/mmu: Remove spurious clearing of dirty bit from TDP MMU SPTE mainline inclusion from mainline-v5.13-rc1 commit 6d9aafb96d5f665d038229e7561388c38430b3a8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d9aafb96d5f665d038229e7561388c38430b3a8 ---------------------------------------------------------------------- Don't clear the dirty bit when aging a TDP MMU SPTE (in response to a MMU notifier event). Prematurely clearing the dirty bit could cause spurious PML updates if aging a page happened to coincide with dirty logging. Note, tdp_mmu_set_spte_no_acc_track() flows into __handle_changed_spte(), so the host PFN will be marked dirty, i.e. there is no potential for data corruption. Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU") Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210331004942.2444916-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 42c8076bbfba..9ac2ee61b0f3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -977,7 +977,6 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, new_spte = mark_spte_for_access_track(new_spte); } - new_spte &= ~shadow_dirty_mask; tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); young = 1; -- Gitee From 91e9c4675efc122de2d092973b68186fd62572ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Mar 2021 17:49:42 -0700 Subject: [PATCH 101/158] KVM: x86/mmu: Simplify code for aging SPTEs in TDP MMU mainline inclusion from mainline-v5.13-rc1 commit 8f8f52a45d928d638c7ffbd081de85c692b28964 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8f8f52a45d928d638c7ffbd081de85c692b28964 ---------------------------------------------------------------------- Use a basic NOT+AND sequence to clear the Accessed bit in TDP MMU SPTEs, as opposed to the fancy ffs()+clear_bit() logic that was copied from the legacy MMU. The legacy MMU uses clear_bit() because it is operating on the SPTE itself, i.e. clearing needs to be atomic. The TDP MMU operates on a local variable that it later writes to the SPTE, and so doesn't need to be atomic or even resident in memory. Opportunistically drop unnecessary initialization of new_spte, it's guaranteed to be written before being accessed. 
Using NOT+AND instead of ffs()+clear_bit() reduces the sequence from: 0x0000000000058be6 <+134>: test %rax,%rax 0x0000000000058be9 <+137>: je 0x58bf4 0x0000000000058beb <+139>: test %rax,%rdi 0x0000000000058bee <+142>: je 0x58cdc 0x0000000000058bf4 <+148>: mov %rdi,0x8(%rsp) 0x0000000000058bf9 <+153>: mov $0xffffffff,%edx 0x0000000000058bfe <+158>: bsf %eax,%edx 0x0000000000058c01 <+161>: movslq %edx,%rdx 0x0000000000058c04 <+164>: lock btr %rdx,0x8(%rsp) 0x0000000000058c0b <+171>: mov 0x8(%rsp),%r15 to: 0x0000000000058bdd <+125>: test %rax,%rax 0x0000000000058be0 <+128>: je 0x58beb 0x0000000000058be2 <+130>: test %rax,%r8 0x0000000000058be5 <+133>: je 0x58cc0 0x0000000000058beb <+139>: not %rax 0x0000000000058bee <+142>: and %r8,%rax 0x0000000000058bf1 <+145>: mov %rax,%r15 thus eliminating several memory accesses, including a locked access. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210331004942.2444916-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 9ac2ee61b0f3..c69f5d7d1fd4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -950,7 +950,7 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, { struct tdp_iter iter; int young = 0; - u64 new_spte = 0; + u64 new_spte; rcu_read_lock(); @@ -965,8 +965,7 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, new_spte = iter.old_spte; if (spte_ad_enabled(new_spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)&new_spte); + new_spte &= ~shadow_accessed_mask; } else { /* * Capture the dirty status of the page, so that it doesn't get -- Gitee From 701fea7248e4096ca22c413f28c8f7a190b5e319 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Mar 2021 09:38:16 +0200 Subject: [PATCH 102/158] KVM: MIPS: rework flush_shadow_* callbacks into one that prepares the flush mainline inclusion from mainline-v5.13-rc1 commit 5194552fb1ffb4248c8db3f7286aa4ac7ae3163b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5194552fb1ffb4248c8db3f7286aa4ac7ae3163b ---------------------------------------------------------------------- Both trap-and-emulate and VZ have a single implementation that covers both .flush_shadow_all and .flush_shadow_memslot, and both of them end with a call to kvm_flush_remote_tlbs. Unify the callbacks into one and extract the call to kvm_flush_remote_tlbs. The next patches will pull it further out of the architecture-specific MMU notifier functions kvm_unmap_hva_range and kvm_set_spte_hva.
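For illustration only (not MIPS code; names invented): the refactoring pattern applied here collapses two near-identical callbacks into a single "prepare" hook and moves the common flush into the caller, so that later patches can coalesce or defer that flush. A minimal standalone sketch:

/* prepare_then_flush.c - standalone model of the callback refactor */
#include <stdio.h>

struct toy_callbacks {
        /* After the refactor: only arch-specific preparation remains. */
        void (*prepare_flush_shadow)(void);
};

static void toy_prepare_flush_shadow(void)
{
        printf("arch: mark per-CPU state so the flush is honoured\n");
}

/* Stand-in for kvm_flush_remote_tlbs(): the one common flush primitive. */
static void toy_flush_remote_tlbs(void)
{
        printf("common: kick vCPUs and flush TLBs\n");
}

/* Callers now do prepare + common flush instead of choosing between two
 * shadow-flush callbacks that both ended in the same flush. */
static void toy_flush_shadow_all(const struct toy_callbacks *cb)
{
        cb->prepare_flush_shadow();
        toy_flush_remote_tlbs();
}

int main(void)
{
        const struct toy_callbacks cb = {
                .prepare_flush_shadow = toy_prepare_flush_shadow,
        };

        toy_flush_shadow_all(&cb);
        return 0;
}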
Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/mips/include/asm/kvm_host.h | 9 +-------- arch/mips/kvm/mips.c | 12 ++++++------ arch/mips/kvm/mmu.c | 9 ++++++--- arch/mips/kvm/trap_emul.c | 13 ++----------- arch/mips/kvm/vz.c | 19 ++++--------------- 5 files changed, 19 insertions(+), 43 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index feaa77036b67..6c8c0ab53be2 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -815,14 +815,7 @@ struct kvm_mips_callbacks { int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); - void (*flush_shadow_all)(struct kvm *kvm); - /* - * Must take care of flushing any cached GPA PTEs (e.g. guest entries in - * VZ root TLB, or T&E GVA page tables and corresponding root TLB - * mappings). - */ - void (*flush_shadow_memslot)(struct kvm *kvm, - const struct kvm_memory_slot *slot); + void (*prepare_flush_shadow)(struct kvm *kvm); gpa_t (*gva_to_gpa)(gva_t gva); void (*queue_timer_int)(struct kvm_vcpu *vcpu); void (*dequeue_timer_int)(struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 3d6a7f5827b1..37e5ed316f66 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -206,7 +206,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) kvm_mips_flush_gpa_pt(kvm, 0, ~0); /* Let implementation do the rest */ - kvm_mips_callbacks->flush_shadow_all(kvm); + kvm_mips_callbacks->prepare_flush_shadow(kvm); + kvm_flush_remote_tlbs(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -221,8 +222,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, /* Flush slot from GPA */ kvm_mips_flush_gpa_pt(kvm, slot->base_gfn, slot->base_gfn + slot->npages - 1); - /* Let implementation do the rest */ - kvm_mips_callbacks->flush_shadow_memslot(kvm, slot); + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); spin_unlock(&kvm->mmu_lock); } @@ -262,9 +262,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, /* Write protect GPA page table entries */ needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn, new->base_gfn + new->npages - 1); - /* Let implementation do the rest */ if (needs_flush) - kvm_mips_callbacks->flush_shadow_memslot(kvm, new); + kvm_arch_flush_remote_tlbs_memslot(kvm, new); spin_unlock(&kvm->mmu_lock); } } @@ -1000,7 +999,8 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { /* Let implementation handle TLB/GVA invalidation */ - kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); + kvm_mips_callbacks->prepare_flush_shadow(kvm); + kvm_flush_remote_tlbs(kvm); } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index c0c5f39c3434..1a807aafb57c 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -490,7 +490,8 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, { handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); - kvm_mips_callbacks->flush_shadow_all(kvm); + kvm_mips_callbacks->prepare_flush_shadow(kvm); + kvm_flush_remote_tlbs(kvm); return 0; } @@ -531,8 +532,10 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) int ret; ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); - if (ret) - kvm_mips_callbacks->flush_shadow_all(kvm); + if (ret) { + kvm_mips_callbacks->prepare_flush_shadow(kvm); + kvm_flush_remote_tlbs(kvm); + } return 0; 
} diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 0788c00d7e94..5f2df497599c 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -687,16 +687,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm) +static void kvm_trap_emul_prepare_flush_shadow(struct kvm *kvm) { - /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */ - kvm_flush_remote_tlbs(kvm); -} - -static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, - const struct kvm_memory_slot *slot) -{ - kvm_trap_emul_flush_shadow_all(kvm); } static u64 kvm_trap_emul_get_one_regs[] = { @@ -1280,8 +1272,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, - .flush_shadow_all = kvm_trap_emul_flush_shadow_all, - .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot, + .prepare_flush_shadow = kvm_trap_emul_prepare_flush_shadow, .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb, .queue_timer_int = kvm_mips_queue_timer_int_cb, .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb, diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 2ffbe9264a31..2c75571dc4a2 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3211,32 +3211,22 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -static void kvm_vz_flush_shadow_all(struct kvm *kvm) +static void kvm_vz_prepare_flush_shadow(struct kvm *kvm) { - if (cpu_has_guestid) { - /* Flush GuestID for each VCPU individually */ - kvm_flush_remote_tlbs(kvm); - } else { + if (!cpu_has_guestid) { /* * For each CPU there is a single GPA ASID used by all VCPUs in * the VM, so it doesn't make sense for the VCPUs to handle * invalidation of these ASIDs individually. * * Instead mark all CPUs as needing ASID invalidation in - * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to + * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will * kick any running VCPUs so they check asid_flush_mask. 
*/ cpumask_setall(&kvm->arch.asid_flush_mask); - kvm_flush_remote_tlbs(kvm); } } -static void kvm_vz_flush_shadow_memslot(struct kvm *kvm, - const struct kvm_memory_slot *slot) -{ - kvm_vz_flush_shadow_all(kvm); -} - static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -3292,8 +3282,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = { .vcpu_init = kvm_vz_vcpu_init, .vcpu_uninit = kvm_vz_vcpu_uninit, .vcpu_setup = kvm_vz_vcpu_setup, - .flush_shadow_all = kvm_vz_flush_shadow_all, - .flush_shadow_memslot = kvm_vz_flush_shadow_memslot, + .prepare_flush_shadow = kvm_vz_prepare_flush_shadow, .gva_to_gpa = kvm_vz_gva_to_gpa_cb, .queue_timer_int = kvm_vz_queue_timer_int_cb, .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb, -- Gitee From d42a2911337ba573c50e15486c37ac0aeaf2e5ad Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 2 Apr 2021 11:44:56 +0200 Subject: [PATCH 103/158] KVM: MIPS: let generic code call prepare_flush_shadow mainline inclusion from mainline-v5.13-rc1 commit 566a0beef52c83f13b67aef02b2bc0aa63c0e0d6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=566a0beef52c83f13b67aef02b2bc0aa63c0e0d6 ---------------------------------------------------------------------- Since all calls to kvm_flush_remote_tlbs must be preceded by kvm_mips_callbacks->prepare_flush_shadow, repurpose kvm_arch_flush_remote_tlb to invoke it. This makes it possible to use the TLB flushing mechanism provided by the generic MMU notifier code. Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/mips/include/asm/kvm_host.h | 3 +++ arch/mips/kvm/mips.c | 11 ++++++----- arch/mips/kvm/mmu.c | 6 +----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 6c8c0ab53be2..d0944a75fc8d 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -1142,4 +1142,7 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +int kvm_arch_flush_remote_tlb(struct kvm *kvm); + #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 37e5ed316f66..730b9ef0fb9a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -204,9 +204,6 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) { /* Flush whole GPA */ kvm_mips_flush_gpa_pt(kvm, 0, ~0); - - /* Let implementation do the rest */ - kvm_mips_callbacks->prepare_flush_shadow(kvm); kvm_flush_remote_tlbs(kvm); } @@ -995,11 +992,15 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) } +int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + kvm_mips_callbacks->prepare_flush_shadow(kvm); + return 1; +} + void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { - /* Let implementation handle TLB/GVA invalidation */ - kvm_mips_callbacks->prepare_flush_shadow(kvm); kvm_flush_remote_tlbs(kvm); } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 1a807aafb57c..1ed0bfe6f7a5 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -489,8 +489,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { handle_hva_to_gpa(kvm, start, end, 
&kvm_unmap_hva_handler, NULL); - - kvm_mips_callbacks->prepare_flush_shadow(kvm); kvm_flush_remote_tlbs(kvm); return 0; } @@ -532,10 +530,8 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) int ret; ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); - if (ret) { - kvm_mips_callbacks->prepare_flush_shadow(kvm); + if (ret) kvm_flush_remote_tlbs(kvm); - } return 0; } -- Gitee From f03440ac50993730ccb6ade2a062fd86a0067021 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Mar 2021 09:40:39 +0200 Subject: [PATCH 104/158] KVM: MIPS: defer flush to generic MMU notifier code mainline inclusion from mainline-v5.13-rc1 commit fe9a5b055116dff7fcee081abc2def4b14d24c21 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fe9a5b055116dff7fcee081abc2def4b14d24c21 ---------------------------------------------------------------------- Return 1 from kvm_unmap_hva_range and kvm_set_spte_hva if a flush is needed, so that the generic code can coalesce the flushes. Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/mips/kvm/mmu.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 1ed0bfe6f7a5..f519269cc187 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -488,9 +488,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); - kvm_flush_remote_tlbs(kvm); - return 0; + return handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); } static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, @@ -527,12 +525,7 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; - int ret; - - ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); - if (ret) - kvm_flush_remote_tlbs(kvm); - return 0; + return handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); } static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, -- Gitee From 11fd0128e38943670fa1998474dcd159dae356f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:50 -0700 Subject: [PATCH 105/158] KVM: Move x86's MMU notifier memslot walkers to generic code mainline inclusion from mainline-v5.13-rc1 commit 3039bcc744980afe87c612122e47a27306483bc2 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3039bcc744980afe87c612122e47a27306483bc2 ---------------------------------------------------------------------- Move the hva->gfn lookup for MMU notifiers into common code. Every arch does a similar lookup, and some arch code is all but identical across multiple architectures. In addition to consolidating code, this will allow introducing optimizations that will benefit all architectures without incurring multiple walks of the memslots, e.g. by taking mmu_lock if and only if a relevant range exists in the memslots. The use of __always_inline to avoid indirect call retpolines, as done by x86, may also benefit other architectures. 
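(Illustration only, not kernel code; names invented.) The __always_inline remark above can be seen in a standalone sketch: when the range-walking helper is forced inline into each caller, the handler argument is a known function there, so the compiler can usually emit a direct call instead of an indirect one that would otherwise go through a retpoline on mitigated hosts.

/* always_inline_handler.c - sketch of devirtualising a handler callback */
#include <stdbool.h>
#include <stdio.h>

/* Mirrors the shape of the kernel's __always_inline. */
#define toy_always_inline inline __attribute__((always_inline))

struct toy_gfn_range {
        unsigned long start;
        unsigned long end;
};

typedef bool (*toy_handler_t)(const struct toy_gfn_range *range);

static bool toy_age_gfn(const struct toy_gfn_range *range)
{
        return range->end > range->start;  /* pretend: something was young */
}

/* Because this is always inlined into its callers, 'handler' is a known
 * function there and the call below can be lowered to a direct call. */
static toy_always_inline bool toy_handle_gfn_range(const struct toy_gfn_range *range,
                                                   toy_handler_t handler)
{
        return handler(range);
}

int main(void)
{
        struct toy_gfn_range range = { .start = 0x100, .end = 0x180 };

        printf("young=%d\n", toy_handle_gfn_range(&range, toy_age_gfn));
        return 0;
}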
Consolidating the lookups also fixes a wart in x86, where the legacy MMU and TDP MMU each do their own memslot walks. Lastly, future enhancements to the memslot implementation, e.g. to add an interval tree to track host address, will need to touch far less arch specific code. MIPS, PPC, and arm64 will be converted one at a time in future patches. Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-3-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: virt/kvm/kvm_main.c Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 127 +++++++---------- arch/x86/kvm/mmu/tdp_mmu.c | 235 +++++++++++--------------------- arch/x86/kvm/mmu/tdp_mmu.h | 14 +- include/linux/kvm_host.h | 14 ++ virt/kvm/kvm_main.c | 171 ++++++++++++++++++++++- 6 files changed, 315 insertions(+), 247 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 21bb5738e2cc..b545d6ebc990 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1726,6 +1726,7 @@ asmlinkage void kvm_spurious_fault(void); _ASM_EXTABLE(666b, 667b) #define KVM_ARCH_WANT_MMU_NOTIFIER +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b81ccb6c0ceb..5ea95d8c8512 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1321,26 +1321,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return flush; } -static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { return kvm_zap_rmapp(kvm, rmap_head, slot); } -static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t pte) { u64 *sptep; struct rmap_iterator iter; int need_flush = 0; u64 new_spte; - pte_t *ptep = (pte_t *)data; kvm_pfn_t new_pfn; - WARN_ON(pte_huge(*ptep)); - new_pfn = pte_pfn(*ptep); + WARN_ON(pte_huge(pte)); + new_pfn = pte_pfn(pte); restart: for_each_rmap_spte(rmap_head, &iter, sptep) { @@ -1349,7 +1348,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, need_flush = 1; - if (pte_write(*ptep)) { + if (pte_write(pte)) { pte_list_remove(rmap_head, sptep); goto restart; } else { @@ -1437,86 +1436,52 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -typedef int (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, unsigned long data); +typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t pte); -static __always_inline int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - rmap_handler_t handler) +static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + rmap_handler_t handler) { - struct kvm_memslots *slots; - struct 
kvm_memory_slot *memslot; struct slot_rmap_walk_iterator iterator; - int ret = 0; - int i; - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; + bool ret = false; - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for_each_slot_rmap_range(memslot, PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - gfn_start, gfn_end - 1, - &iterator) - ret |= handler(kvm, iterator.rmap, memslot, - iterator.gfn, iterator.level, data); - } - } + for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + range->start, range->end - 1, &iterator) + ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, + iterator.level, range->pte); return ret; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, rmap_handler_t handler) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); -} - -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) -{ - int r; + bool flush; - r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) - r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); - return r; + return flush; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int r; + bool flush; - r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); if (is_tdp_mmu_enabled(kvm)) - r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); - return r; + return flush; } -static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1528,9 +1493,9 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return young; } -static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, unsigned long data) +static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1552,29 +1517,31 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool 
kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int young = false; + bool young; + + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); - young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); if (is_tdp_mmu_enabled(kvm)) - young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int young = false; + bool young; + + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); - young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); if (is_tdp_mmu_enabled(kvm)) - young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c69f5d7d1fd4..ce9fc229e225 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -873,204 +873,135 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return ret; } -typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, gfn_t end, - unsigned long data); - -static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - tdp_handler_t handler) +bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, + bool flush) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; struct kvm_mmu_page *root; - int ret = 0; - int as_id; - - for (as_id = 0; as_id < KVM_ADDRESS_SPACE_NUM; as_id++) { - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) { - slots = __kvm_memslots(kvm, as_id); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. 
- */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - ret |= handler(kvm, memslot, root, gfn_start, - gfn_end, data); - } - } - } - return ret; -} + for_each_tdp_mmu_root(kvm, root, range->slot->as_id) + flush |= zap_gfn_range(kvm, root, range->start, range->end, + false, flush); -static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm, - unsigned long addr, - unsigned long data, - tdp_handler_t handler) -{ - return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler); + return flush; } -static int zap_gfn_range_hva_wrapper(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, - gfn_t end, unsigned long unused) -{ - return zap_gfn_range(kvm, root, start, end, false, false); -} +typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range); -int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end) +static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, + struct kvm_gfn_range *range, + tdp_handler_t handler) { - return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, - zap_gfn_range_hva_wrapper); + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; + + rcu_read_lock(); + + for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) + ret |= handler(kvm, &iter, range); + } + + rcu_read_unlock(); + + return ret; } /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. */ -static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, gfn_t end, - unsigned long unused) +static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - int young = 0; - u64 new_spte; + u64 new_spte = 0; - rcu_read_lock(); + /* If we have a non-accessed entry we don't need to change the pte. */ + if (!is_accessed_spte(iter->old_spte)) + return false; - tdp_root_for_each_leaf_pte(iter, root, start, end) { + new_spte = iter->old_spte; + + if (spte_ad_enabled(new_spte)) { + new_spte &= ~shadow_accessed_mask; + } else { /* - * If we have a non-accessed entry we don't need to change the - * pte. + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. */ - if (!is_accessed_spte(iter.old_spte)) - continue; + if (is_writable_pte(new_spte)) + kvm_set_pfn_dirty(spte_to_pfn(new_spte)); - new_spte = iter.old_spte; - - if (spte_ad_enabled(new_spte)) { - new_spte &= ~shadow_accessed_mask; - } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. 
- */ - if (is_writable_pte(new_spte)) - kvm_set_pfn_dirty(spte_to_pfn(new_spte)); - - new_spte = mark_spte_for_access_track(new_spte); - } - - tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); - young = 1; + new_spte = mark_spte_for_access_track(new_spte); } - rcu_read_unlock(); + tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); - return young; + return true; } -int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end) +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, - age_gfn_range); + return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); } -static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t end, - unsigned long unused) +static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - - tdp_root_for_each_leaf_pte(iter, root, gfn, end) - if (is_accessed_spte(iter.old_spte)) - return 1; - - return 0; + return is_accessed_spte(iter->old_spte); } -int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn); + return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); } -/* - * Handle the changed_pte MMU notifier for the TDP MMU. - * data is a pointer to the new pte_t mapping the HVA specified by the MMU - * notifier. - * Returns non-zero if a flush is needed before releasing the MMU lock. - */ -static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t end, - unsigned long data) +static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - pte_t *ptep = (pte_t *)data; - kvm_pfn_t new_pfn; u64 new_spte; - int need_flush = 0; - - rcu_read_lock(); - WARN_ON(pte_huge(*ptep) || (gfn + 1) != end); + /* Huge pages aren't expected to be modified without first being zapped. */ + WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); - new_pfn = pte_pfn(*ptep); - - tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { - if (iter.level != PG_LEVEL_4K) - continue; - - if (!is_shadow_present_pte(iter.old_spte)) - break; - - /* - * Note, when changing a read-only SPTE, it's not strictly - * necessary to zero the SPTE before setting the new PFN, but - * doing so preserves the invariant that the PFN of a present - * leaf SPTE can never change. See __handle_changed_spte(). - */ - tdp_mmu_set_spte(kvm, &iter, 0); + if (iter->level != PG_LEVEL_4K || + !is_shadow_present_pte(iter->old_spte)) + return false; - if (!pte_write(*ptep)) { - new_spte = kvm_mmu_changed_pte_notifier_make_spte( - iter.old_spte, new_pfn); + /* + * Note, when changing a read-only SPTE, it's not strictly necessary to + * zero the SPTE before setting the new PFN, but doing so preserves the + * invariant that the PFN of a present * leaf SPTE can never change. + * See __handle_changed_spte(). 
+ */ + tdp_mmu_set_spte(kvm, iter, 0); - tdp_mmu_set_spte(kvm, &iter, new_spte); - } + if (!pte_write(range->pte)) { + new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, + pte_pfn(range->pte)); - need_flush = 1; + tdp_mmu_set_spte(kvm, iter, new_spte); } - if (need_flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); - - rcu_read_unlock(); - - return 0; + return true; } -int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, - pte_t *host_ptep) +/* + * Handle the changed_pte MMU notifier for the TDP MMU. + * data is a pointer to the new pte_t mapping the HVA specified by the MMU + * notifier. + * Returns non-zero if a flush is needed before releasing the MMU lock. + */ +bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep, - set_tdp_spte); + bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); + + /* FIXME: return 'flush' instead of flushing here. */ + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, range->start, 1); + + return false; } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index ac07443b530e..76c9d613758f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -33,15 +33,11 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, bool prefault); -int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end); - -int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end); -int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); - -int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, - pte_t *host_ptep); +bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, + bool flush); +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int min_level); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5e644f9611c2..349e608b0608 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -219,11 +219,25 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif #ifdef KVM_ARCH_WANT_MMU_NOTIFIER +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS +struct kvm_gfn_range { + struct kvm_memory_slot *slot; + gfn_t start; + gfn_t end; + pte_t pte; + bool may_block; +}; +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +#else int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags); int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */ #endif enum { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9dc85c23316d..e297e6a6460d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -509,16 +509,136 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, 
srcu_read_unlock(&kvm->srcu, idx); } +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + +typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); + +struct kvm_hva_range { + unsigned long start; + unsigned long end; + pte_t pte; + hva_handler_t handler; + bool flush_on_ret; + bool may_block; +}; + +static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_hva_range *range) +{ + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + struct kvm_gfn_range gfn_range; + bool ret = false; + int i, idx; + + lockdep_assert_held_write(&kvm->mmu_lock); + + idx = srcu_read_lock(&kvm->srcu); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + unsigned long hva_start, hva_end; + + hva_start = max(range->start, slot->userspace_addr); + hva_end = min(range->end, slot->userspace_addr + + (slot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + /* + * To optimize for the likely case where the address + * range is covered by zero or one memslots, don't + * bother making these conditional (to avoid writes on + * the second or later invocation of the handler). + */ + gfn_range.pte = range->pte; + gfn_range.may_block = range->may_block; + + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_range.start = hva_to_gfn_memslot(hva_start, slot); + gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); + gfn_range.slot = slot; + + ret |= range->handler(kvm, &gfn_range); + } + } + + if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) + kvm_flush_remote_tlbs(kvm); + + srcu_read_unlock(&kvm->srcu, idx); + + /* The notifiers are averse to booleans. :-( */ + return (int)ret; +} + +static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + pte_t pte, + hva_handler_t handler) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + const struct kvm_hva_range range = { + .start = start, + .end = end, + .pte = pte, + .handler = handler, + .flush_on_ret = true, + .may_block = false, + }; + int ret; + + KVM_MMU_LOCK(kvm); + ret = __kvm_handle_hva_range(kvm, &range); + KVM_MMU_UNLOCK(kvm); + + return ret; +} + +static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + hva_handler_t handler) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + const struct kvm_hva_range range = { + .start = start, + .end = end, + .pte = __pte(0), + .handler = handler, + .flush_on_ret = false, + .may_block = false, + }; + int ret; + + KVM_MMU_LOCK(kvm); + ret = __kvm_handle_hva_range(kvm, &range); + KVM_MMU_UNLOCK(kvm); + + return ret; +} +#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */ + static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, pte_t pte) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS int idx; - + struct kvm *kvm = mmu_notifier_to_kvm(mn); +#endif trace_kvm_set_spte_hva(address); + +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); +#else idx = srcu_read_lock(&kvm->srcu); KVM_MMU_LOCK(kvm); @@ -530,17 +650,32 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); +#endif } static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct 
mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + const struct kvm_hva_range hva_range = { + .start = range->start, + .end = range->end, + .pte = __pte(0), + .handler = kvm_unmap_gfn_range, + .flush_on_ret = true, + .may_block = mmu_notifier_range_blockable(range), + }; +#else int need_tlb_flush = 0, idx; +#endif trace_kvm_unmap_hva_range(range->start, range->end); +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS idx = srcu_read_lock(&kvm->srcu); +#endif + KVM_MMU_LOCK(kvm); /* * The count increase must become visible at unlock time as no @@ -566,15 +701,22 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mmu_notifier_range_end = max(kvm->mmu_notifier_range_end, range->end); } + +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + __kvm_handle_hva_range(kvm, &hva_range); +#else need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, range->flags); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush || kvm->tlbs_dirty) kvm_flush_remote_tlbs(kvm); +#endif KVM_MMU_UNLOCK(kvm); kvm_arch_guest_memory_reclaimed(kvm); +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS srcu_read_unlock(&kvm->srcu, idx); +#endif return 0; } @@ -608,11 +750,15 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, unsigned long start, unsigned long end) { +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; - +#endif trace_kvm_age_hva(start, end); +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn); +#else idx = srcu_read_lock(&kvm->srcu); KVM_MMU_LOCK(kvm); @@ -624,6 +770,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, srcu_read_unlock(&kvm->srcu, idx); return young; +#endif } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -631,13 +778,13 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, unsigned long start, unsigned long end) { +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; +#endif trace_kvm_age_hva(start, end); - idx = srcu_read_lock(&kvm->srcu); - KVM_MMU_LOCK(kvm); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is @@ -651,22 +798,33 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. 
*/ +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); +#else + idx = srcu_read_lock(&kvm->srcu); + KVM_MMU_LOCK(kvm); young = kvm_age_hva(kvm, start, end); KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; +#endif } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) { +#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; - +#endif trace_kvm_test_age_hva(address); +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS + return kvm_handle_hva_range_no_flush(mn, address, address + 1, + kvm_test_age_gfn); +#else idx = srcu_read_lock(&kvm->srcu); KVM_MMU_LOCK(kvm); young = kvm_test_age_hva(kvm, address); @@ -674,6 +832,7 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, srcu_read_unlock(&kvm->srcu, idx); return young; +#endif } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, -- Gitee From 255fb73d354607ed0e5ccec39950b0201ec05d9b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:51 -0700 Subject: [PATCH 106/158] KVM: arm64: Convert to the gfn-based MMU notifier callbacks mainline inclusion from mainline-v5.13-rc1 commit cd4c7183522817087278b3e000b8a888f9f91b5c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cd4c7183522817087278b3e000b8a888f9f91b5c ---------------------------------------------------------------------- Move arm64 to the gfn-base MMU notifier APIs, which do the hva->gfn lookup in common code. No meaningful functional change intended, though the exact order of operations is slightly different since the memslot lookups occur before calling into arch code. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/mmu.c | 115 ++++++++---------------------- 2 files changed, 32 insertions(+), 84 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a84343e48b76..ae068c02a0c4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -525,6 +525,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 2035f4876cab..6fa92a1435a5 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -872,7 +872,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk * the page we just got a reference to gets unmapped before we have a * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_hva will take it away + * unmapped afterwards, the call to kvm_unmap_gfn will take it away * from us again properly. This smp_rmb() interacts with the smp_wmb() * in kvm_mmu_notifier_invalidate_. 
*/ @@ -1106,123 +1106,70 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) return ret; } -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, - gpa_t gpa, u64 size, - void *data), - void *data) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gpa; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; - ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); - } - - return ret; -} + if (!kvm->arch.mmu.pgt) + return 0; -static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - unsigned flags = *(unsigned *)data; - bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE; + __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); - __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block); return 0; } -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { + kvm_pfn_t pfn = pte_pfn(range->pte); + if (!kvm->arch.mmu.pgt) return 0; - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - kvm_pfn_t *pfn = (kvm_pfn_t *)data; + WARN_ON(range->end - range->start != 1); - WARN_ON(size != PAGE_SIZE); + /* + * We've moved a page around, probably through CoW, so let's treat it + * just like a translation fault and clean the cache to the PoC. + */ + clean_dcache_guest_page(pfn, PAGE_SIZE); /* * The MMU notifiers will have unmapped a huge PMD before calling - * ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and * therefore we never need to clear out a huge PMD through this * calling path and a memcache is not required. */ - kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE, - __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL); + kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, + PAGE_SIZE, __pfn_to_phys(pfn), + KVM_PGTABLE_PROT_R, NULL); + return 0; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - unsigned long end = hva + PAGE_SIZE; - kvm_pfn_t pfn = pte_pfn(pte); + u64 size = (range->end - range->start) << PAGE_SHIFT; + kvm_pte_t kpte; + pte_t pte; if (!kvm->arch.mmu.pgt) return 0; - /* - * We've moved a page around, probably through CoW, so let's treat it - * just like a translation fault and clean the cache to the PoC. 
- */ - clean_dcache_guest_page(pfn, PAGE_SIZE); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn); - return 0; -} - -static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - pte_t pte; - kvm_pte_t kpte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa); + + kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT); pte = __pte(kpte); return pte_valid(pte) && pte_young(pte); } -static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa); -} - -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -{ - if (!kvm->arch.mmu.pgt) - return 0; - - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); -} - -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) return 0; - return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, - kvm_test_age_hva_handler, NULL); + return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT); } phys_addr_t kvm_mmu_get_httbr(void) -- Gitee From 18a8fb4fe195cdee109a0c59fde8249ba4027c37 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:52 -0700 Subject: [PATCH 107/158] KVM: MIPS/MMU: Convert to the gfn-based MMU notifier callbacks mainline inclusion from mainline-v5.13-rc1 commit d923ff258423b7c30e257d7adcd791f845e1b5fb category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d923ff258423b7c30e257d7adcd791f845e1b5fb ---------------------------------------------------------------------- Move MIPS to the gfn-based MMU notifier APIs, which do the hva->gfn lookup in common code, and whose code is nearly identical to MIPS' lookup. No meaningful functional change intended, though the exact order of operations is slightly different since the memslot lookups occur before calling into arch code. 
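For illustration (editor's sketch, not part of the upstream diff): with the gfn-based API the hva->gfn lookup is done once in common code, so an architecture only implements per-range callbacks of the following shape. The zap helper below is a placeholder, not a real MIPS function.

	bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
	{
		gfn_t gfn;

		/* range->slot, range->start and range->end were resolved by common code */
		for (gfn = range->start; gfn < range->end; gfn++)
			arch_zap_gfn(kvm, range->slot, gfn);	/* placeholder helper */

		/* a true return asks common code to flush remote TLBs */
		return true;
	}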
Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-5-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/mmu.c | 92 +++++--------------------------- 2 files changed, 14 insertions(+), 79 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index d0944a75fc8d..b70645670548 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -960,6 +960,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, bool write); #define KVM_ARCH_WANT_MMU_NOTIFIER +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index f519269cc187..a5ecefb9729b 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -438,82 +438,34 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn, end_gfn << PAGE_SHIFT); } -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, gfn_t gfn, - gpa_t gfn_end, - struct kvm_memory_slot *memslot, - void *data), - void *data) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. 
- */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - ret |= handler(kvm, gfn, gfn_end, memslot, data); - } - - return ret; -} - - -static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) -{ - kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end); + kvm_mips_flush_gpa_pt(kvm, range->start, range->end); return 1; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) -{ - return handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); -} - -static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - gpa_t gpa = gfn << PAGE_SHIFT; - pte_t hva_pte = *(pte_t *)data; + gpa_t gpa = range->start << PAGE_SHIFT; + pte_t hva_pte = range->pte; pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); pte_t old_pte; if (!gpa_pte) - return 0; + return false; /* Mapping may need adjusting depending on memslot flags */ old_pte = *gpa_pte; - if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) + if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) hva_pte = pte_mkclean(hva_pte); - else if (memslot->flags & KVM_MEM_READONLY) + else if (range->slot->flags & KVM_MEM_READONLY) hva_pte = pte_wrprotect(hva_pte); set_pte(gpa_pte, hva_pte); /* Replacing an absent or old page doesn't need flushes */ if (!pte_present(old_pte) || !pte_young(old_pte)) - return 0; + return false; /* Pages swapped, aged, moved, or cleaned require flushes */ return !pte_present(hva_pte) || @@ -522,22 +474,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, (pte_dirty(old_pte) && !pte_dirty(hva_pte)); } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - unsigned long end = hva + PAGE_SIZE; - return handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); + return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end); } -static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end); -} - -static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) -{ - gpa_t gpa = gfn << PAGE_SHIFT; + gpa_t gpa = range->start << PAGE_SHIFT; pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); if (!gpa_pte) @@ -545,16 +489,6 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, return pte_young(*gpa_pte); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -{ - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); -} - -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); -} - /** * _kvm_mips_map_page_fast() - Fast path GPA fault handler. * @vcpu: VCPU pointer. 
-- Gitee From ba2975f2aa67c0fd2ed0136d48aa1686881132f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:53 -0700 Subject: [PATCH 108/158] KVM: PPC: Convert to the gfn-based MMU notifier callbacks mainline inclusion from mainline-v5.13-rc1 commit b1c5356e873cc4fcbb8f58965e0cd910f3ee37a9 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b1c5356e873cc4fcbb8f58965e0cd910f3ee37a9 ---------------------------------------------------------------------- Move PPC to the gfn-base MMU notifier APIs, and update all 15 bajillion PPC-internal hooks to work with gfns instead of hvas. No meaningful functional change intended, though the exact order of operations is slightly different since the memslot lookups occur before calling into arch code. Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-6-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/powerpc/include/asm/kvm_book3s.h | 12 ++-- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/kvm_ppc.h | 9 ++- arch/powerpc/kvm/book3s.c | 18 +++-- arch/powerpc/kvm/book3s.h | 10 ++- arch/powerpc/kvm/book3s_64_mmu_hv.c | 98 +++++++------------------- arch/powerpc/kvm/book3s_64_mmu_radix.c | 25 +++---- arch/powerpc/kvm/book3s_hv.c | 12 ++-- arch/powerpc/kvm/book3s_pr.c | 56 +++++---------- arch/powerpc/kvm/e500_mmu_host.c | 27 +++---- 10 files changed, 95 insertions(+), 173 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index d32ec9ae73bd..6b32ebea8513 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -210,12 +210,12 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); -extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); -extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); -extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); +extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); extern void kvmppc_radix_flush_memslot(struct kvm *kvm, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 4c0d787c2436..54c25bcbcf97 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -55,6 +55,7 @@ #include #define KVM_ARCH_WANT_MMU_NOTIFIER +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 0a056c64c317..5950cbe75b28 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -281,11 +281,10 @@ struct kvmppc_ops { const struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); - int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, - unsigned long end); - 
int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); - int (*test_age_hva)(struct kvm *kvm, unsigned long hva); - void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte); + bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); void (*free_memslot)(struct kvm_memory_slot *slot); int (*init_vm)(struct kvm *kvm); void (*destroy_vm)(struct kvm *kvm); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 44bf567b6589..2b691f4d1f26 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -834,26 +834,24 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change); } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); + return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->age_hva(kvm, start, end); + return kvm->arch.kvm_ops->age_gfn(kvm, range); } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->test_age_hva(kvm, hva); + return kvm->arch.kvm_ops->test_age_gfn(kvm, range); } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); - return 0; + return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); } int kvmppc_core_init_vm(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 9b6323ec8e60..740e51def5a5 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -9,12 +9,10 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot); -extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, - unsigned long end); -extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, - unsigned long end); -extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); -extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); +extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8e06cd3f759c..dfff8e17cff4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -751,51 +751,6 @@ void kvmppc_rmap_reset(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, srcu_idx); } -typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); - -static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, 
- hva_handler_fn handler) -{ - int ret; - int retval = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn, gfn+1, ..., gfn_end-1}. - */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for (; gfn < gfn_end; ++gfn) { - ret = handler(kvm, memslot, gfn); - retval |= ret; - } - } - - return retval; -} - -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - hva_handler_fn handler) -{ - return kvm_handle_hva_range(kvm, hva, hva + 1, handler); -} - /* Must be called with both HPTE and rmap locked */ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, struct kvm_memory_slot *memslot, @@ -839,8 +794,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, } } -static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { unsigned long i; __be64 *hptep; @@ -873,16 +828,15 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } - return 0; + return false; } -int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + if (kvm_is_radix(kvm)) + return kvm_unmap_radix(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; - kvm_handle_hva_range(kvm, start, end, handler); - return 0; + return kvm_unmap_rmapp(kvm, range->slot, range->start); } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, @@ -912,8 +866,8 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, } } -static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; @@ -967,26 +921,26 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, return ret; } -int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + if (kvm_is_radix(kvm)) + kvm_age_radix(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? 
kvm_age_radix : kvm_age_rmapp; - return kvm_handle_hva_range(kvm, start, end, handler); + return kvm_age_rmapp(kvm, range->slot, range->start); } -static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long *hp; - int ret = 1; + bool ret = true; unsigned long *rmapp; rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; if (*rmapp & KVMPPC_RMAP_REFERENCED) - return 1; + return true; lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_REFERENCED) @@ -1001,27 +955,27 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, goto out; } while ((i = j) != head); } - ret = 0; + ret = false; out: unlock_rmap(rmapp); return ret; } -int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + if (kvm_is_radix(kvm)) + kvm_test_age_radix(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; - return kvm_handle_hva(kvm, hva, handler); + return kvm_test_age_rmapp(kvm, range->slot, range->start); } -void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + if (kvm_is_radix(kvm)) + return kvm_unmap_radix(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; - kvm_handle_hva(kvm, hva, handler); + return kvm_unmap_rmapp(kvm, range->slot, range->start); } static int vcpus_running(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index a6be53efc436..58d2da69997f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -995,8 +995,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, } /* Called with kvm->mmu_lock held */ -int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; @@ -1004,24 +1004,24 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) { uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT); - return 0; + return false; } ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); - return 0; + return false; } /* Called with kvm->mmu_lock held */ -int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - int ref = 0; + bool ref = false; unsigned long old, *rmapp; if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) @@ -1037,26 +1037,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0, old & PTE_RPN_MASK, 1UL << shift); - ref = 1; + ref = true; } return ref; } /* Called with kvm->mmu_lock held */ -int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +bool kvm_test_age_radix(struct kvm 
*kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) + { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - int ref = 0; + bool ref = false; if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return ref; ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) - ref = 1; + ref = true; return ref; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1d2593238995..2c993c368cf7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4802,7 +4802,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) kvmhv_release_all_nested(kvm); kvmppc_rmap_reset(kvm); kvm->arch.process_table = 0; - /* Mutual exclusion with kvm_unmap_hva_range etc. */ + /* Mutual exclusion with kvm_unmap_gfn_range etc. */ spin_lock(&kvm->mmu_lock); kvm->arch.radix = 0; spin_unlock(&kvm->mmu_lock); @@ -4824,7 +4824,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) if (err) return err; kvmppc_rmap_reset(kvm); - /* Mutual exclusion with kvm_unmap_hva_range etc. */ + /* Mutual exclusion with kvm_unmap_gfn_range etc. */ spin_lock(&kvm->mmu_lock); kvm->arch.radix = 1; spin_unlock(&kvm->mmu_lock); @@ -5661,10 +5661,10 @@ static struct kvmppc_ops kvm_ops_hv = { .flush_memslot = kvmppc_core_flush_memslot_hv, .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, .commit_memory_region = kvmppc_core_commit_memory_region_hv, - .unmap_hva_range = kvm_unmap_hva_range_hv, - .age_hva = kvm_age_hva_hv, - .test_age_hva = kvm_test_age_hva_hv, - .set_spte_hva = kvm_set_spte_hva_hv, + .unmap_gfn_range = kvm_unmap_gfn_range_hv, + .age_gfn = kvm_age_gfn_hv, + .test_age_gfn = kvm_test_age_gfn_hv, + .set_spte_gfn = kvm_set_spte_gfn_hv, .free_memslot = kvmppc_core_free_memslot_hv, .init_vm = kvmppc_core_init_vm_hv, .destroy_vm = kvmppc_core_destroy_vm_hv, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b1fefa63e125..893873448794 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -425,61 +425,39 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) } /************* MMU Notifiers *************/ -static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { long i; struct kvm_vcpu *vcpu; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT, + range->end << PAGE_SHIFT); - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn, gfn+1, ..., gfn_end-1}. 
- */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - kvm_for_each_vcpu(i, vcpu, kvm) - kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, - gfn_end << PAGE_SHIFT); - } + return false; } -static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range) { - do_kvm_unmap_hva(kvm, start, end); - - return 0; + return do_kvm_unmap_gfn(kvm, range); } -static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) +static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) +static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* The page will get remapped properly on its next fault */ - do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); + return do_kvm_unmap_gfn(kvm, range); } /*****************************************/ @@ -2079,10 +2057,10 @@ static struct kvmppc_ops kvm_ops_pr = { .flush_memslot = kvmppc_core_flush_memslot_pr, .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, .commit_memory_region = kvmppc_core_commit_memory_region_pr, - .unmap_hva_range = kvm_unmap_hva_range_pr, - .age_hva = kvm_age_hva_pr, - .test_age_hva = kvm_test_age_hva_pr, - .set_spte_hva = kvm_set_spte_hva_pr, + .unmap_gfn_range = kvm_unmap_gfn_range_pr, + .age_gfn = kvm_age_gfn_pr, + .test_age_gfn = kvm_test_age_gfn_pr, + .set_spte_gfn = kvm_set_spte_gfn_pr, .free_memslot = kvmppc_core_free_memslot_pr, .init_vm = kvmppc_core_init_vm_pr, .destroy_vm = kvmppc_core_destroy_vm_pr, diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 648aefe1a3e7..7f16afc331ef 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -721,43 +721,36 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, /************* MMU Notifiers *************/ -static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* * Flush all shadow tlb entries everywhere. 
This is slow, but * we are 100% sure that we catch the to be unmapped page */ - kvm_flush_remote_tlbs(kvm); - - return 0; + return true; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - /* kvm_unmap_hva flushes everything anyways */ - kvm_unmap_hva(kvm, start); - - return 0; + return kvm_e500_mmu_unmap_gfn(kvm, range); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* The page will get remapped properly on its next fault */ - kvm_unmap_hva(kvm, hva); - return 0; + return kvm_e500_mmu_unmap_gfn(kvm, range); } /*****************************************/ -- Gitee From bd42f5af900f7cbe9e949fb126daeb892ed43454 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 5 May 2021 22:15:09 +1000 Subject: [PATCH 109/158] KVM: PPC: Book3S HV: Fix conversion to gfn-based MMU notifier callbacks mainline inclusion from mainline-v5.13-rc2 commit 34114136f725cbd0c83e7b5a0c8a977976cd82f7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=34114136f725cbd0c83e7b5a0c8a977976cd82f7 ---------------------------------------------------------------------- Commit b1c5356e873c ("KVM: PPC: Convert to the gfn-based MMU notifier callbacks") causes unmap_gfn_range and age_gfn callbacks to only work on the first gfn in the range. It also makes the aging callbacks call into both radix and hash aging functions for radix guests. Fix this. Add warnings for the single-gfn calls that have been converted to range callbacks, in case they ever receieve ranges greater than 1. 
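For illustration, a sketch of the corrected pattern (it mirrors the hash-MMU hunks below): every gfn in the notifier range is visited and the per-gfn results are OR'ed together, instead of handling only range->start.

	bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
	{
		gfn_t gfn;
		bool ret = false;

		for (gfn = range->start; gfn < range->end; gfn++)
			ret |= kvm_age_rmapp(kvm, range->slot, gfn);

		return ret;
	}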
Fixes: b1c5356e873c ("KVM: PPC: Convert to the gfn-based MMU notifier callbacks") Reported-by: Bharata B Rao Tested-by: Bharata B Rao Signed-off-by: Nicholas Piggin Message-Id: <20210505121509.1470207-1-npiggin@gmail.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/powerpc/include/asm/kvm_book3s.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 46 ++++++++++++++++++-------- arch/powerpc/kvm/book3s_64_mmu_radix.c | 5 ++- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 6b32ebea8513..402963dd17f0 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -210,7 +210,7 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); -extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, +extern void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index dfff8e17cff4..de295a470f7c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -794,7 +794,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, } } -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, +static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { unsigned long i; @@ -828,15 +828,21 @@ static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } - return false; } bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - return kvm_unmap_radix(kvm, range->slot, range->start); + gfn_t gfn; + + if (kvm_is_radix(kvm)) { + for (gfn = range->start; gfn < range->end; gfn++) + kvm_unmap_radix(kvm, range->slot, gfn); + } else { + for (gfn = range->start; gfn < range->end; gfn++) + kvm_unmap_rmapp(kvm, range->slot, range->start); + } - return kvm_unmap_rmapp(kvm, range->slot, range->start); + return false; } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, @@ -923,10 +929,18 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - kvm_age_radix(kvm, range->slot, range->start); + gfn_t gfn; + bool ret = false; - return kvm_age_rmapp(kvm, range->slot, range->start); + if (kvm_is_radix(kvm)) { + for (gfn = range->start; gfn < range->end; gfn++) + ret |= kvm_age_radix(kvm, range->slot, gfn); + } else { + for (gfn = range->start; gfn < range->end; gfn++) + ret |= kvm_age_rmapp(kvm, range->slot, gfn); + } + + return ret; } static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -964,18 +978,24 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - kvm_test_age_radix(kvm, range->slot, range->start); + WARN_ON(range->start + 1 != range->end); - return kvm_test_age_rmapp(kvm, range->slot, range->start); + if (kvm_is_radix(kvm)) + return kvm_test_age_radix(kvm, range->slot, range->start); + else + return kvm_test_age_rmapp(kvm, range->slot, 
range->start); } bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { + WARN_ON(range->start + 1 != range->end); + if (kvm_is_radix(kvm)) - return kvm_unmap_radix(kvm, range->slot, range->start); + kvm_unmap_radix(kvm, range->slot, range->start); + else + kvm_unmap_rmapp(kvm, range->slot, range->start); - return kvm_unmap_rmapp(kvm, range->slot, range->start); + return false; } static int vcpus_running(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 58d2da69997f..e7924664a944 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -995,7 +995,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, } /* Called with kvm->mmu_lock held */ -bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, +void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { pte_t *ptep; @@ -1004,14 +1004,13 @@ bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) { uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT); - return false; + return; } ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); - return false; } /* Called with kvm->mmu_lock held */ -- Gitee From ef18d479eb7664ebafc4d53e0243214024a36ea1 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Sat, 11 Nov 2023 16:50:57 +0800 Subject: [PATCH 110/158] KVM: RISC-V: Convert to the gfn-based MMU notifier callbacks Intel inclusion category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA ---------------------------------------------------------------------- Align with the upstream RISC-V code by using the gfn-based MMU notifier APIs, which do the hva->gfn lookup in common code. Note: RISC-V's MMU notifier support was introduced in upstream commit 9955371cc014 ("RISC-V: KVM: Implement MMU notifiers"), which used the gfn-based MMU notifiers in the first place. And with commit b4c5936c47f8 ("KVM: Kill off the old hva-based MMU notifier callbacks") backported, it is now safe to use the latest, upstream-aligned implementation. Also, KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS is used, so that other architectures which cannot easily use the gfn-based MMU notifiers can continue to use their hva-based code.
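For reference, a sketch of what opting in amounts to on the architecture side; the prototypes mirror those used throughout this series, and their exact placement in the RISC-V headers is illustrative only.

	#define KVM_ARCH_WANT_MMU_NOTIFIER
	#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS

	/* per-range callbacks that common code invokes with mmu_lock held */
	bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
	bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
	bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
	bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);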
Signed-off-by: Anup Patel Signed-off-by: Yu Zhang --- arch/riscv/include/asm/kvm_host.h | 6 +- arch/riscv/kvm/mmu.c | 113 ++++++++---------------------- 2 files changed, 30 insertions(+), 89 deletions(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 0818705e3c2b..5c68905d3614 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -218,11 +218,7 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end, unsigned int flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa, unsigned long vmid); void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 56fda9ef70fd..311b50b36f67 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -412,38 +412,6 @@ int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, } -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, - gpa_t gpa, u64 size, - void *data), - void *data) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gpa; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; - ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); - } - - return ret; -} - void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, @@ -597,94 +565,71 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return ret; } -static int kvm_unmap_hva_handler(struct kvm *kvm, - gpa_t gpa, u64 size, void *data) -{ - unsigned int flags = *(unsigned int *)data; - bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE; - - stage2_unmap_range(kvm, gpa, size, may_block); - return 0; -} - -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end, unsigned int flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.pgd) return 0; - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, - gpa_t gpa, u64 size, void *data) -{ - pte_t *pte = (pte_t *)data; - - WARN_ON(size != PAGE_SIZE); - stage2_set_pte(kvm, 0, NULL, gpa, pte); - + stage2_unmap_range(kvm, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); return 0; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - unsigned long end = hva + PAGE_SIZE; - kvm_pfn_t pfn = pte_pfn(pte); - pte_t stage2_pte; + int ret; + kvm_pfn_t pfn = pte_pfn(range->pte); if (!kvm->arch.pgd) return 0; - stage2_pte = pfn_pte(pfn, PAGE_WRITE_EXEC); - 
handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); + WARN_ON(range->end - range->start != 1); + + ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT, + __pfn_to_phys(pfn), PAGE_SIZE, true, true); + if (ret) { + kvm_debug("Failed to map stage2 page (error %d)\n", ret); + return 1; + } return 0; } -static int kvm_age_hva_handler(struct kvm *kvm, - gpa_t gpa, u64 size, void *data) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { pte_t *ptep; u32 ptep_level = 0; + u64 size = (range->end - range->start) << PAGE_SHIFT; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); - - if (!stage2_get_leaf_entry(kvm, gpa, &ptep, &ptep_level)) + if (!kvm->arch.pgd) return 0; - return ptep_test_and_clear_young(NULL, 0, ptep); -} + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -{ - if (!kvm->arch.pgd) + if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) return 0; - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); + return ptep_test_and_clear_young(NULL, 0, ptep); } -static int kvm_test_age_hva_handler(struct kvm *kvm, - gpa_t gpa, u64 size, void *data) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { pte_t *ptep; u32 ptep_level = 0; + u64 size = (range->end - range->start) << PAGE_SHIFT; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - if (!stage2_get_leaf_entry(kvm, gpa, &ptep, &ptep_level)) + if (!kvm->arch.pgd) return 0; - return pte_young(*ptep); -} + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - if (!kvm->arch.pgd) + if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) return 0; - return handle_hva_to_gpa(kvm, hva, hva, - kvm_test_age_hva_handler, NULL); + return pte_young(*ptep); } int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, -- Gitee From ac63cf0a8e38a3569ab927f34e700d3a0d178336 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:55 -0700 Subject: [PATCH 111/158] KVM: Move MMU notifier's mmu_lock acquisition into common helper mainline inclusion from mainline-v5.13-rc1 commit f922bd9bf33bd5a8c6694927f010f32127810fbf category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f922bd9bf33bd5a8c6694927f010f32127810fbf ---------------------------------------------------------------------- Acquire and release mmu_lock in the __kvm_handle_hva_range() helper instead of requiring the caller to do the same. This paves the way for future patches to take mmu_lock if and only if an overlapping memslot is found, without also having to introduce the on_lock() shenanigans used to manipulate the notifier count and sequence. No functional change intended. 
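Illustration (editor's sketch, simplified from the hunks below) of the caller-side effect: the wrappers no longer bracket the walk with mmu_lock themselves.

	/* before: every wrapper took and released mmu_lock around the walk */
	KVM_MMU_LOCK(kvm);
	ret = __kvm_handle_hva_range(kvm, &range);
	KVM_MMU_UNLOCK(kvm);

	/* after: __kvm_handle_hva_range() acquires and releases mmu_lock itself */
	ret = __kvm_handle_hva_range(kvm, &range);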
Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-8-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: virt/kvm/kvm_main.c Signed-off-by: Yu Zhang --- virt/kvm/kvm_main.c | 116 +++++++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 45 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e297e6a6460d..1749a6a6e0f1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -513,28 +513,57 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); +typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, + unsigned long end); + struct kvm_hva_range { unsigned long start; unsigned long end; pte_t pte; hva_handler_t handler; + on_lock_fn_t on_lock; bool flush_on_ret; bool may_block; }; +/* + * Use a dedicated stub instead of NULL to indicate that there is no callback + * function/handler. The compiler technically can't guarantee that a real + * function will have a non-zero address, and so it will generate code to + * check for !NULL, whereas comparing against a stub will be elided at compile + * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9). + */ +static void kvm_null_fn(void) +{ + +} +#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) + static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, const struct kvm_hva_range *range) { + struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; - struct kvm_gfn_range gfn_range; bool ret = false; int i, idx; - lockdep_assert_held_write(&kvm->mmu_lock); + /* A null handler is allowed if and only if on_lock() is provided. */ + if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && + IS_KVM_NULL_FN(range->handler))) + return 0; + + KVM_MMU_LOCK(kvm); idx = srcu_read_lock(&kvm->srcu); + if (!IS_KVM_NULL_FN(range->on_lock)) { + range->on_lock(kvm, range->start, range->end); + + if (IS_KVM_NULL_FN(range->handler)) + goto out_unlock; + } + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, slots) { @@ -570,6 +599,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) kvm_flush_remote_tlbs(kvm); +out_unlock: + KVM_MMU_UNLOCK(kvm); + srcu_read_unlock(&kvm->srcu, idx); /* The notifiers are averse to booleans. 
:-( */ @@ -588,16 +620,12 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .end = end, .pte = pte, .handler = handler, + .on_lock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = false, }; - int ret; - - KVM_MMU_LOCK(kvm); - ret = __kvm_handle_hva_range(kvm, &range); - KVM_MMU_UNLOCK(kvm); - return ret; + return __kvm_handle_hva_range(kvm, &range); } static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, @@ -611,16 +639,41 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn .end = end, .pte = __pte(0), .handler = handler, + .on_lock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = false, }; - int ret; - KVM_MMU_LOCK(kvm); - ret = __kvm_handle_hva_range(kvm, &range); - KVM_MMU_UNLOCK(kvm); + return __kvm_handle_hva_range(kvm, &range); +} - return ret; +static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + /* + * The count increase must become visible at unlock time as no + * spte can be established without taking the mmu_lock and + * count is also read inside the mmu_lock critical section. + */ + kvm->mmu_notifier_count++; + if (likely(kvm->mmu_notifier_count == 1)) { + kvm->mmu_notifier_range_start = start; + kvm->mmu_notifier_range_end = end; + } else { + /* + * Fully tracking multiple concurrent ranges has dimishing + * returns. Keep things simple and just find the minimal range + * which includes the current and new ranges. As there won't be + * enough information to subtract a range after its invalidate + * completes, any ranges invalidated concurrently will + * accumulate and persist until all outstanding invalidates + * complete. + */ + kvm->mmu_notifier_range_start = + min(kvm->mmu_notifier_range_start, start); + kvm->mmu_notifier_range_end = + max(kvm->mmu_notifier_range_end, end); + } } #endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */ @@ -663,6 +716,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .end = range->end, .pte = __pte(0), .handler = kvm_unmap_gfn_range, + .on_lock = kvm_inc_notifier_count, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; @@ -672,47 +726,19 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, trace_kvm_unmap_hva_range(range->start, range->end); -#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS - idx = srcu_read_lock(&kvm->srcu); -#endif - - KVM_MMU_LOCK(kvm); - /* - * The count increase must become visible at unlock time as no - * spte can be established without taking the mmu_lock and - * count is also read inside the mmu_lock critical section. - */ - kvm->mmu_notifier_count++; - if (likely(kvm->mmu_notifier_count == 1)) { - kvm->mmu_notifier_range_start = range->start; - kvm->mmu_notifier_range_end = range->end; - } else { - /* - * Fully tracking multiple concurrent ranges has dimishing - * returns. Keep things simple and just find the minimal range - * which includes the current and new ranges. As there won't be - * enough information to subtract a range after its invalidate - * completes, any ranges invalidated concurrently will - * accumulate and persist until all outstanding invalidates - * complete. 
- */ - kvm->mmu_notifier_range_start = - min(kvm->mmu_notifier_range_start, range->start); - kvm->mmu_notifier_range_end = - max(kvm->mmu_notifier_range_end, range->end); - } - #ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS __kvm_handle_hva_range(kvm, &hva_range); #else + idx = srcu_read_lock(&kvm->srcu); + KVM_MMU_LOCK(kvm); need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, range->flags); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush || kvm->tlbs_dirty) kvm_flush_remote_tlbs(kvm); -#endif - KVM_MMU_UNLOCK(kvm); +#endif + kvm_arch_guest_memory_reclaimed(kvm); #ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS srcu_read_unlock(&kvm->srcu, idx); -- Gitee From 12e0edba61b6e7f684dfea8de4e7d32f909d33e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:56 -0700 Subject: [PATCH 112/158] KVM: Take mmu_lock when handling MMU notifier iff the hva hits a memslot mainline inclusion from mainline-v5.13-rc1 commit 8931a454aea03bab21b3b8fcdc94f674eebd1c5d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8931a454aea03bab21b3b8fcdc94f674eebd1c5d ---------------------------------------------------------------------- Defer acquiring mmu_lock in the MMU notifier paths until a "hit" has been detected in the memslots, i.e. don't take the lock for notifications that don't affect the guest. For small VMs, spurious locking is a minor annoyance. And for "volatile" setups where the majority of notifications _are_ relevant, this barely qualifies as an optimization. But, for large VMs (hundreds of threads) with static setups, e.g. no page migration, no swapping, etc..., the vast majority of MMU notifier callbacks will be unrelated to the guest, e.g. will often be in response to the userspace VMM adjusting its own virtual address space. In such large VMs, acquiring mmu_lock can be painful as it blocks vCPUs from handling page faults. In some scenarios it can even be "fatal" in the sense that it causes unacceptable brownouts, e.g. when rebuilding huge pages after live migration, a significant percentage of vCPUs will be attempting to handle page faults. x86's TDP MMU implementation is especially susceptible to spurious locking due it taking mmu_lock for read when handling page faults. Because rwlock is fair, a single writer will stall future readers, while the writer is itself stalled waiting for in-progress readers to complete. This is exacerbated by the MMU notifiers often firing multiple times in quick succession, e.g. moving a page will (always?) invoke three separate notifiers: .invalidate_range_start(), invalidate_range_end(), and .change_pte(). Unnecessarily taking mmu_lock each time means even a single spurious sequence can be problematic. Note, this optimizes only the unpaired callbacks. Optimizing the .invalidate_range_{start,end}() pairs is more complex and will be done in a future patch. 
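Illustration of the resulting lock-elision pattern (editor's sketch; the overlap test is written as a placeholder predicate instead of the open-coded hva comparison in the diff):

	bool locked = false, ret = false;

	kvm_for_each_memslot(slot, slots) {
		if (!hva_range_overlaps_slot(range, slot))	/* placeholder predicate */
			continue;
		if (!locked) {
			locked = true;
			KVM_MMU_LOCK(kvm);	/* taken only once a memslot hit is found */
		}
		ret |= range->handler(kvm, &gfn_range);
	}
	if (locked)
		KVM_MMU_UNLOCK(kvm);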
Suggested-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-9-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- virt/kvm/kvm_main.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1749a6a6e0f1..d82afb53e360 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -542,10 +542,10 @@ static void kvm_null_fn(void) static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, const struct kvm_hva_range *range) { + bool ret = false, locked = false; struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; - bool ret = false; int i, idx; /* A null handler is allowed if and only if on_lock() is provided. */ @@ -553,11 +553,13 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, IS_KVM_NULL_FN(range->handler))) return 0; - KVM_MMU_LOCK(kvm); - idx = srcu_read_lock(&kvm->srcu); + /* The on_lock() path does not yet support lock elision. */ if (!IS_KVM_NULL_FN(range->on_lock)) { + locked = true; + KVM_MMU_LOCK(kvm); + range->on_lock(kvm, range->start, range->end); if (IS_KVM_NULL_FN(range->handler)) @@ -592,6 +594,10 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; + if (!locked) { + locked = true; + KVM_MMU_LOCK(kvm); + } ret |= range->handler(kvm, &gfn_range); } } @@ -600,7 +606,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); out_unlock: - KVM_MMU_UNLOCK(kvm); + if (locked) + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); -- Gitee From dea4c3b2cc449ca3907f7b69da6c41608b61cda8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:58 -0700 Subject: [PATCH 113/158] KVM: x86/mmu: Allow yielding during MMU notifier unmap/zap, if possible mainline inclusion from mainline-v5.13-rc1 commit e1eed5847b09fe41d4db4b86f9d840aba869c905 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e1eed5847b09fe41d4db4b86f9d840aba869c905 ---------------------------------------------------------------------- Let the TDP MMU yield when unmapping a range in response to a MMU notification, if yielding is allowed by said notification. There is no reason to disallow yielding in this case, and in theory the range being invalidated could be quite large. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-11-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index ce9fc229e225..4af7b1977f77 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -880,7 +880,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, for_each_tdp_mmu_root(kvm, root, range->slot->as_id) flush |= zap_gfn_range(kvm, root, range->start, range->end, - false, flush); + range->may_block, flush); return flush; } @@ -898,6 +898,10 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, rcu_read_lock(); + /* + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. 
+ */ for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) ret |= handler(kvm, &iter, range); -- Gitee From 7c5b95a2951976cf043f60129bd3c6c2ebce3246 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Mon, 16 Oct 2023 11:29:31 +0800 Subject: [PATCH 114/158] KVM: x86/mmu: Introduce the on_unlock hook to flush the cache for SEV Intel inclusion category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA ---------------------------------------------------------------------- Add on_unlock in struct kvm_hva_range, which is initialized to kvm_arch_guest_memory_reclaimed(), so that the CPU caches can be flushed when memory is reclaimed from an SEV guest. This is done in upstreaming commit 683412ccf612 ("KVM: SEV: add cache flush to solve SEV cache incoherency issues"), and backported into openEuler without using the on_unlock hook in commit 5e0a056024247, supposedly due to the lack of a group of MMU notifier consolidation and optimization patches - especially upstreaming commit f922bd9bf33b ("KVM: Move MMU notifier's mmu_lock acquisition into common helper"), and commit 8931a454aea0 ("KVM: Take mmu_lock when handling MMU notifier iff the hva hits a memslot"). With these 2 patches backported, we cook this seperate patch to align with upstreaming methods to flush cache when page is reclaimed for SEV guests. Signed-off-by: Yu Zhang --- virt/kvm/kvm_main.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d82afb53e360..6b21b74cf70f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -516,12 +516,15 @@ typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, unsigned long end); +typedef void (*on_unlock_fn_t)(struct kvm *kvm); + struct kvm_hva_range { unsigned long start; unsigned long end; pte_t pte; hva_handler_t handler; on_lock_fn_t on_lock; + on_unlock_fn_t on_unlock; bool flush_on_ret; bool may_block; }; @@ -606,8 +609,12 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); out_unlock: - if (locked) + if (locked) { KVM_MMU_UNLOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_unlock)) + range->on_unlock(kvm); + + } srcu_read_unlock(&kvm->srcu, idx); @@ -628,6 +635,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .pte = pte, .handler = handler, .on_lock = (void *)kvm_null_fn, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = false, }; @@ -647,6 +655,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn .pte = __pte(0), .handler = handler, .on_lock = (void *)kvm_null_fn, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = false, }; @@ -724,6 +733,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .pte = __pte(0), .handler = kvm_unmap_gfn_range, .on_lock = kvm_inc_notifier_count, + .on_unlock = kvm_arch_guest_memory_reclaimed, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; -- Gitee From 529c4ebd450c00a35adbbf5d01dd38a090a63908 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 6 Jan 2021 16:19:35 -0800 Subject: [PATCH 115/158] KVM: x86/mmu: Clarify TDP MMU page list invariants mainline inclusion from mainline-v5.11-rc3 commit c0dba6e46825716db15c4b3a8f05c85b4a59edda category: feature bugzilla: 
https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0dba6e46825716db15c4b3a8f05c85b4a59edda ---------------------------------------------------------------------- The tdp_mmu_roots and tdp_mmu_pages in struct kvm_arch should only contain pages with tdp_mmu_page set to true. tdp_mmu_pages should not contain any pages with a non-zero root_count and tdp_mmu_roots should only contain pages with a positive root_count, unless a thread holds the MMU lock and is in the process of modifying the list. Various functions expect these invariants to be maintained, but they are not explictily documented. Add to the comments on both fields to document the above invariants. Signed-off-by: Ben Gardon Message-Id: <20210107001935.3732070-2-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b545d6ebc990..77c6cbbcf2c8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1039,9 +1039,21 @@ struct kvm_arch { */ bool tdp_mmu_enabled; - /* List of struct tdp_mmu_pages being used as roots */ + /* + * List of struct kvmp_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set. + * All struct kvm_mmu_pages in the list should have a positive + * root_count except when a thread holds the MMU lock and is removing + * an entry from the list. + */ struct list_head tdp_mmu_roots; - /* List of struct tdp_mmu_pages not being used as roots */ + + /* + * List of struct kvmp_mmu_pages not being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set and a root_count of 0. + */ struct list_head tdp_mmu_pages; /* -- Gitee From 79f4f554b2ccf0e34e449f4a3faff4641b382060 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:24 -0700 Subject: [PATCH 116/158] KVM: x86/mmu: Re-add const qualifier in kvm_tdp_mmu_zap_collapsible_sptes mainline inclusion from mainline-v5.13-rc1 commit 8ca6f063b73d3754213d009efb3df486c8fe52d2 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8ca6f063b73d3754213d009efb3df486c8fe52d2 ---------------------------------------------------------------------- kvm_tdp_mmu_zap_collapsible_sptes unnecessarily removes the const qualifier from its memlsot argument, leading to a compiler warning. Add the const annotation and pass it to subsequent functions. Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-2-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 10 +++++----- arch/x86/kvm/mmu/mmu_internal.h | 5 +++-- arch/x86/kvm/mmu/tdp_mmu.c | 5 +++-- arch/x86/kvm/mmu/tdp_mmu.h | 3 ++- include/linux/kvm_host.h | 2 +- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5ea95d8c8512..4e3f5a4dfad3 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -711,8 +711,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) * handling slots that are not large page aligned. 
*/ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) + const struct kvm_memory_slot *slot, int level) { unsigned long idx; @@ -2711,7 +2710,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { unsigned long hva; pte_t *pte; @@ -2737,8 +2736,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, return level; } -int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t pfn, int max_level) +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t pfn, int max_level) { struct kvm_lpage_info *linfo; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1685c34693bb..0440b3a48631 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -151,8 +151,9 @@ enum { #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) #define SET_SPTE_SPURIOUS BIT(2) -int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t pfn, int max_level); +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t pfn, int max_level); int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4af7b1977f77..672e7850b574 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1241,7 +1241,7 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) */ static bool zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - struct kvm_memory_slot *slot, + const struct kvm_memory_slot *slot, bool flush) { gfn_t start = slot->base_gfn; @@ -1282,7 +1282,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, * be replaced by large mappings, for GFNs within the slot. */ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *slot, bool flush) + const struct kvm_memory_slot *slot, + bool flush) { struct kvm_mmu_page *root; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 76c9d613758f..4ef3b527e645 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -49,7 +49,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, bool wrprot); bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *slot, bool flush); + const struct kvm_memory_slot *slot, + bool flush); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 349e608b0608..5545ebd8de71 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1159,7 +1159,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) } static inline unsigned long -__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { /* * The index was checked originally in search_memslots. 
To avoid -- Gitee From bf29e6908b402a81b05d2f214aad69fe66bebaeb Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:25 -0700 Subject: [PATCH 117/158] KVM: x86/mmu: Move kvm_mmu_(get|put)_root to TDP MMU mainline inclusion from mainline-v5.13-rc1 commit 76eb54e7e717455b4c5b82cec5c879ea017610f5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76eb54e7e717455b4c5b82cec5c879ea017610f5 ---------------------------------------------------------------------- The TDP MMU is almost the only user of kvm_mmu_get_root and kvm_mmu_put_root. There is only one use of put_root in mmu.c for the legacy / shadow MMU. Open code that one use and move the get / put functions to the TDP MMU so they can be extended in future commits. No functional change intended. Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-3-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 10 ++++------ arch/x86/kvm/mmu/mmu_internal.h | 16 ---------------- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++--- arch/x86/kvm/mmu/tdp_mmu.h | 18 ++++++++++++++++++ 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4e3f5a4dfad3..52062ed3fa3b 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3131,12 +3131,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (WARN_ON(!sp)) return; - if (kvm_mmu_put_root(kvm, sp)) { - if (is_tdp_mmu_page(sp)) - kvm_tdp_mmu_free_root(kvm, sp); - else if (sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); - } + if (is_tdp_mmu_page(sp) && kvm_tdp_mmu_put_root(kvm, sp)) + kvm_tdp_mmu_free_root(kvm, sp); + else if (!--sp->root_count && sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); *root_hpa = INVALID_PAGE; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 0440b3a48631..50e4d1daee0c 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -113,22 +113,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); -static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - BUG_ON(!sp->root_count); - lockdep_assert_held(&kvm->mmu_lock); - - ++sp->root_count; -} - -static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - lockdep_assert_held(&kvm->mmu_lock); - --sp->root_count; - - return !sp->root_count; -} - /* * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). * diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 672e7850b574..18231c8dabcb 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -43,7 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) { - if (kvm_mmu_put_root(kvm, root)) + if (kvm_tdp_mmu_put_root(kvm, root)) kvm_tdp_mmu_free_root(kvm, root); } @@ -55,7 +55,7 @@ static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) return false; - kvm_mmu_get_root(kvm, root); + kvm_tdp_mmu_get_root(kvm, root); return true; } @@ -154,7 +154,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) /* Check for an existing root before allocating a new one. 
*/ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { if (root->role.word == role.word) { - kvm_mmu_get_root(kvm, root); + kvm_tdp_mmu_get_root(kvm, root); goto out; } } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 4ef3b527e645..4c371e074c99 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -8,6 +8,24 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); +static inline void kvm_tdp_mmu_get_root(struct kvm *kvm, + struct kvm_mmu_page *root) +{ + BUG_ON(!root->root_count); + lockdep_assert_held(&kvm->mmu_lock); + + ++root->root_count; +} + +static inline bool kvm_tdp_mmu_put_root(struct kvm *kvm, + struct kvm_mmu_page *root) +{ + lockdep_assert_held(&kvm->mmu_lock); + --root->root_count; + + return !root->root_count; +} + bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush); static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, -- Gitee From d876a4ed200acb41822d8c9ec7eb372eb3c06dd6 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:26 -0700 Subject: [PATCH 118/158] KVM: x86/mmu: use tdp_mmu_free_sp to free roots mainline inclusion from mainline-v5.13-rc1 commit 4bba36d72bf95038727d091a64dcb524dccc5da1 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4bba36d72bf95038727d091a64dcb524dccc5da1 ---------------------------------------------------------------------- Minor cleanup to deduplicate the code used to free a struct kvm_mmu_page in the TDP MMU. No functional change intended. Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-4-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 18231c8dabcb..d1f07aa1605b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -92,6 +92,12 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield, bool flush); +static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) +{ + free_page((unsigned long)sp->spt); + kmem_cache_free(mmu_page_header_cache, sp); +} + void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); @@ -105,8 +111,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) zap_gfn_range(kvm, root, 0, max_gfn, false, false); - free_page((unsigned long)root->spt); - kmem_cache_free(mmu_page_header_cache, root); + tdp_mmu_free_sp(root); } static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, @@ -168,12 +173,6 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) return __pa(root->spt); } -static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) -{ - free_page((unsigned long)sp->spt); - kmem_cache_free(mmu_page_header_cache, sp); -} - /* * This is called through call_rcu in order to free TDP page table memory * safely with respect to other kernel threads that may be operating on -- Gitee From 7f85e8ad1059f36e84f95fdc053cdffd4af55779 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:27 -0700 Subject: [PATCH 
119/158] KVM: x86/mmu: Merge TDP MMU put and free root mainline inclusion from mainline-v5.13-rc1 commit 2bdb3d84cebca2e3b482788615ff1559bc8cedb5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2bdb3d84cebca2e3b482788615ff1559bc8cedb5 ---------------------------------------------------------------------- kvm_tdp_mmu_put_root and kvm_tdp_mmu_free_root are always called together, so merge the functions to simplify TDP MMU root refcounting / freeing. Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-5-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 4 +-- arch/x86/kvm/mmu/tdp_mmu.c | 54 ++++++++++++++++++-------------------- arch/x86/kvm/mmu/tdp_mmu.h | 10 +------ 3 files changed, 28 insertions(+), 40 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 52062ed3fa3b..0f7b088ff99c 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3131,8 +3131,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (WARN_ON(!sp)) return; - if (is_tdp_mmu_page(sp) && kvm_tdp_mmu_put_root(kvm, sp)) - kvm_tdp_mmu_free_root(kvm, sp); + if (is_tdp_mmu_page(sp)) + kvm_tdp_mmu_put_root(kvm, sp); else if (!--sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d1f07aa1605b..69e085a14355 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -41,10 +41,31 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) rcu_barrier(); } -static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield, bool flush); + +static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { - if (kvm_tdp_mmu_put_root(kvm, root)) - kvm_tdp_mmu_free_root(kvm, root); + free_page((unsigned long)sp->spt); + kmem_cache_free(mmu_page_header_cache, sp); +} + +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + + lockdep_assert_held_write(&kvm->mmu_lock); + + if (--root->root_count) + return; + + WARN_ON(!root->tdp_mmu_page); + + list_del(&root->link); + + zap_gfn_range(kvm, root, 0, max_gfn, false, false); + + tdp_mmu_free_sp(root); } static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, @@ -66,7 +87,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *next_root; next_root = list_next_entry(root, link); - tdp_mmu_put_root(kvm, root); + kvm_tdp_mmu_put_root(kvm, root); return next_root; } @@ -89,31 +110,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush); - -static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) -{ - free_page((unsigned long)sp->spt); - kmem_cache_free(mmu_page_header_cache, sp); -} - -void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) -{ - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - - lockdep_assert_held_write(&kvm->mmu_lock); - - WARN_ON(root->root_count); - WARN_ON(!root->tdp_mmu_page); - - list_del(&root->link); - - zap_gfn_range(kvm, root, 0, max_gfn, false, false); - - 
tdp_mmu_free_sp(root); -} - static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, int level) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 4c371e074c99..f895671229f0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -6,7 +6,6 @@ #include hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); -void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); static inline void kvm_tdp_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *root) @@ -17,14 +16,7 @@ static inline void kvm_tdp_mmu_get_root(struct kvm *kvm, ++root->root_count; } -static inline bool kvm_tdp_mmu_put_root(struct kvm *kvm, - struct kvm_mmu_page *root) -{ - lockdep_assert_held(&kvm->mmu_lock); - --root->root_count; - - return !root->root_count; -} +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush); -- Gitee From 1e7dcc7d46c56682ac9ea8c606fb0ac3f95495ec Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:28 -0700 Subject: [PATCH 120/158] KVM: x86/mmu: Refactor yield safe root iterator mainline inclusion from mainline-v5.13-rc1 commit cfc109979b3c879ea9df05e048bb83615964f3e3 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cfc109979b3c879ea9df05e048bb83615964f3e3 ---------------------------------------------------------------------- Refactor the yield safe TDP MMU root iterator to be more amenable to changes in future commits which will allow it to be used under the MMU lock in read mode. Currently the iterator requires a complicated dance between the helper functions and different parts of the for loop which makes it hard to reason about. Moving all the logic into a single function simplifies the iterator substantially. Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-6-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 45 ++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 69e085a14355..2bffdf716e51 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -68,26 +68,34 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) tdp_mmu_free_sp(root); } -static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, - struct kvm_mmu_page *root) +/* + * Finds the next valid root after root (or the first valid root if root + * is NULL), takes a reference on it, and returns that next root. If root + * is not NULL, this thread should have already taken a reference on it, and + * that reference will be dropped. If no valid root is found, this + * function will return NULL. 
+ */ +static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, + struct kvm_mmu_page *prev_root) { - lockdep_assert_held_write(&kvm->mmu_lock); + struct kvm_mmu_page *next_root; - if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) - return false; + lockdep_assert_held_write(&kvm->mmu_lock); - kvm_tdp_mmu_get_root(kvm, root); - return true; + if (prev_root) + next_root = list_next_entry(prev_root, link); + else + next_root = list_first_entry(&kvm->arch.tdp_mmu_roots, + typeof(*next_root), link); -} + if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link)) + next_root = NULL; + else + kvm_tdp_mmu_get_root(kvm, next_root); -static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, - struct kvm_mmu_page *root) -{ - struct kvm_mmu_page *next_root; + if (prev_root) + kvm_tdp_mmu_put_root(kvm, prev_root); - next_root = list_next_entry(root, link); - kvm_tdp_mmu_put_root(kvm, root); return next_root; } @@ -98,11 +106,10 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * recent root. (Unless keeping a live reference is desirable.) */ #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ - for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \ - typeof(*_root), link); \ - tdp_mmu_next_root_valid(_kvm, _root); \ - _root = tdp_mmu_next_root(_kvm, _root)) \ - if (kvm_mmu_page_as_id(_root) != _as_id) { \ + for (_root = tdp_mmu_next_root(_kvm, NULL); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ -- Gitee From e6b13bdf35fc8d175f33a613b3ee79451caf9d0e Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:29 -0700 Subject: [PATCH 121/158] KVM: x86/mmu: Make TDP MMU root refcount atomic mainline inclusion from mainline-v5.13-rc1 commit 11cccf5c04721c8a08f9d72a1a5e7281a4041d86 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=11cccf5c04721c8a08f9d72a1a5e7281a4041d86 ---------------------------------------------------------------------- In order to parallelize more operations for the TDP MMU, make the refcount on TDP MMU roots atomic, so that a future patch can allow multiple threads to take a reference on the root concurrently, while holding the MMU lock in read mode. 
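As an aside for readers following the refcounting change, the effect of switching to refcount_dec_and_test() can be modelled in a few lines of ordinary C11. This is an editorial sketch with invented names (struct root, root_put), not the kernel code; it only shows why exactly one of several concurrent "put" callers ends up responsible for teardown once the write lock no longer serializes them.

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  struct root {
          atomic_int refcount;    /* stands in for tdp_mmu_root_count */
  };

  /* Mirrors refcount_dec_and_test(): returns true only for the final put,
   * so a single caller wins the right to tear the root down. */
  static bool root_put(struct root *r)
  {
          return atomic_fetch_sub_explicit(&r->refcount, 1,
                                           memory_order_acq_rel) == 1;
  }

  int main(void)
  {
          struct root r = { .refcount = 2 };

          printf("teardown on 1st put: %d\n", root_put(&r));  /* 0 */
          printf("teardown on 2nd put: %d\n", root_put(&r));  /* 1 */
          return 0;
  }

A plain `--root_count` gives no such guarantee once puts may happen from multiple threads.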
Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-7-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu_internal.h | 6 +++++- arch/x86/kvm/mmu/tdp_mmu.c | 4 ++-- arch/x86/kvm/mmu/tdp_mmu.h | 10 +++++++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 50e4d1daee0c..fb8a593e9d35 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -40,7 +40,11 @@ struct kvm_mmu_page { u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - int root_count; /* Currently serving as active root */ + /* Currently serving as active root */ + union { + int root_count; + refcount_t tdp_mmu_root_count; + }; unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ DECLARE_BITMAP(unsync_child_bitmap, 512); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 2bffdf716e51..1aaaef618bb6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -56,7 +56,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) lockdep_assert_held_write(&kvm->mmu_lock); - if (--root->root_count) + if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; WARN_ON(!root->tdp_mmu_page); @@ -168,7 +168,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) } root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); - root->root_count = 1; + refcount_set(&root->tdp_mmu_root_count, 1); list_add(&root->link, &kvm->arch.tdp_mmu_roots); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index f895671229f0..71f20707052d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -10,10 +10,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); static inline void kvm_tdp_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *root) { - BUG_ON(!root->root_count); - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); - ++root->root_count; + /* + * This should never fail since roots are removed from the roots + * list under the MMU write lock when their reference count falls + * to zero. + */ + refcount_inc_not_zero(&root->tdp_mmu_root_count); } void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); -- Gitee From 4950f298f1467c3e5c3e7fed220d97b0b8d7326e Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:30 -0700 Subject: [PATCH 122/158] KVM: x86/mmu: handle cmpxchg failure in kvm_tdp_mmu_get_root mainline inclusion from mainline-v5.13-rc1 commit fb10129335ca6cc7a229226b03f54394757d773d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fb10129335ca6cc7a229226b03f54394757d773d ---------------------------------------------------------------------- To reduce dependence on the MMU write lock, don't rely on the assumption that the atomic operation in kvm_tdp_mmu_get_root will always succeed. By not relying on that assumption, threads do not need to hold the MMU lock in write mode in order to take a reference on a TDP MMU root. In the root iterator, this change means that some roots might have to be skipped if they are found to have a zero refcount. This will still never happen as of this patch, but a future patch will need that flexibility to make the root iterator safe under the MMU read lock. 
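To make the new contract concrete, kvm_tdp_mmu_get_root() now behaves like refcount_inc_not_zero(), which can be sketched in portable C11 as below. The names are invented for the illustration, and the kernel's refcount_t additionally saturates and warns on misuse, which this sketch omits.

  #include <stdatomic.h>
  #include <stdbool.h>

  /* Take a reference only while the count is non-zero; report failure
   * otherwise so the caller (e.g. the root iterator) skips the object
   * instead of resurrecting one that is already being torn down. */
  static bool tryget(atomic_int *count)
  {
          int cur = atomic_load_explicit(count, memory_order_relaxed);

          do {
                  if (cur == 0)
                          return false;
          } while (!atomic_compare_exchange_weak_explicit(count, &cur, cur + 1,
                                                          memory_order_acquire,
                                                          memory_order_relaxed));
          return true;
  }

  int main(void)
  {
          atomic_int refs = 1;
          bool live = tryget(&refs);      /* true, refs becomes 2 */

          atomic_store(&refs, 0);
          bool dead = tryget(&refs);      /* false, refs stays 0  */

          return live && !dead ? 0 : 1;
  }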
Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-8-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 11 ++++++----- arch/x86/kvm/mmu/tdp_mmu.h | 13 +++---------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1aaaef618bb6..ed797512548c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -88,10 +88,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, next_root = list_first_entry(&kvm->arch.tdp_mmu_roots, typeof(*next_root), link); + while (!list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link) && + !kvm_tdp_mmu_get_root(kvm, next_root)) + next_root = list_next_entry(next_root, link); + if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link)) next_root = NULL; - else - kvm_tdp_mmu_get_root(kvm, next_root); if (prev_root) kvm_tdp_mmu_put_root(kvm, prev_root); @@ -161,10 +163,9 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) /* Check for an existing root before allocating a new one. */ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { - if (root->role.word == role.word) { - kvm_tdp_mmu_get_root(kvm, root); + if (root->role.word == role.word && + kvm_tdp_mmu_get_root(kvm, root)) goto out; - } } root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 71f20707052d..09d483476015 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,17 +7,10 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); -static inline void kvm_tdp_mmu_get_root(struct kvm *kvm, - struct kvm_mmu_page *root) +__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, + struct kvm_mmu_page *root) { - lockdep_assert_held_write(&kvm->mmu_lock); - - /* - * This should never fail since roots are removed from the roots - * list under the MMU write lock when their reference count falls - * to zero. - */ - refcount_inc_not_zero(&root->tdp_mmu_root_count); + return refcount_inc_not_zero(&root->tdp_mmu_root_count); } void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); -- Gitee From f1510accb418a0947615bfcba2c6e349e6d618c0 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:31 -0700 Subject: [PATCH 123/158] KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU mainline inclusion from mainline-v5.13-rc1 commit c0e64238ac53e8226e3fe72279e5e76253e85f88 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0e64238ac53e8226e3fe72279e5e76253e85f88 ---------------------------------------------------------------------- Protect the contents of the TDP MMU roots list with RCU in preparation for a future patch which will allow the iterator macro to be used under the MMU lock in read mode. 
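Condensing the hunks below into one place, the resulting discipline for the roots list looks roughly like the following kernel-style sketch. It reuses the field and helper names from this patch but is schematic only, not a standalone compilable unit:

  /* Writers: publish and unlink under tdp_mmu_pages_lock via the RCU
   * list primitives. */
  spin_lock(&kvm->arch.tdp_mmu_pages_lock);
  list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
  spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

  /* Readers: traverse inside an RCU read-side critical section; entries
   * may be unlinked concurrently but their memory stays valid. */
  rcu_read_lock();
  list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
          /* use root here */
  }
  rcu_read_unlock();

  /* Teardown: unlink first, free only after a grace period by deferring
   * to tdp_mmu_free_sp_rcu_callback() through call_rcu(). */
  spin_lock(&kvm->arch.tdp_mmu_pages_lock);
  list_del_rcu(&root->link);
  spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
  call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);

This is what allows a root observed during traversal to remain valid for the walker even though another thread may be removing it from the list at the same time.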
Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-9-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 21 +++++++--- arch/x86/kvm/mmu/tdp_mmu.c | 69 +++++++++++++++++++-------------- 2 files changed, 55 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 77c6cbbcf2c8..cf20e13d056a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1040,25 +1040,36 @@ struct kvm_arch { bool tdp_mmu_enabled; /* - * List of struct kvmp_mmu_pages being used as roots. + * List of struct kvm_mmu_pages being used as roots. * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. - * All struct kvm_mmu_pages in the list should have a positive - * root_count except when a thread holds the MMU lock and is removing - * an entry from the list. + * + * For reads, this list is protected by: + * the MMU lock in read mode + RCU or + * the MMU lock in write mode + * + * For writes, this list is protected by: + * the MMU lock in read mode + the tdp_mmu_pages_lock or + * the MMU lock in write mode + * + * Roots will remain in the list until their tdp_mmu_root_count + * drops to zero, at which point the thread that decremented the + * count to zero should removed the root from the list and clean + * it up, freeing the root after an RCU grace period. */ struct list_head tdp_mmu_roots; /* * List of struct kvmp_mmu_pages not being used as roots. * All struct kvm_mmu_pages in the list should have - * tdp_mmu_page set and a root_count of 0. + * tdp_mmu_page set and a tdp_mmu_root_count of 0. */ struct list_head tdp_mmu_pages; /* * Protects accesses to the following fields when the MMU lock * is held in read mode: + * - tdp_mmu_roots (above) * - tdp_mmu_pages (above) * - the link field of struct kvm_mmu_pages used by the TDP MMU * - lpage_disallowed_mmu_pages diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index ed797512548c..38f1bedf0627 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -50,6 +50,22 @@ static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) kmem_cache_free(mmu_page_header_cache, sp); } +/* + * This is called through call_rcu in order to free TDP page table memory + * safely with respect to other kernel threads that may be operating on + * the memory. + * By only accessing TDP MMU page table memory in an RCU read critical + * section, and freeing it after a grace period, lockless access to that + * memory won't use it after it is freed. 
+ */ +static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) +{ + struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, + rcu_head); + + tdp_mmu_free_sp(sp); +} + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); @@ -61,11 +77,13 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) WARN_ON(!root->tdp_mmu_page); - list_del(&root->link); + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_del_rcu(&root->link); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); zap_gfn_range(kvm, root, 0, max_gfn, false, false); - tdp_mmu_free_sp(root); + call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } /* @@ -82,18 +100,21 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, lockdep_assert_held_write(&kvm->mmu_lock); + rcu_read_lock(); + if (prev_root) - next_root = list_next_entry(prev_root, link); + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &prev_root->link, + typeof(*prev_root), link); else - next_root = list_first_entry(&kvm->arch.tdp_mmu_roots, - typeof(*next_root), link); + next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, + typeof(*next_root), link); - while (!list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link) && - !kvm_tdp_mmu_get_root(kvm, next_root)) - next_root = list_next_entry(next_root, link); + while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root)) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &next_root->link, typeof(*next_root), link); - if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link)) - next_root = NULL; + rcu_read_unlock(); if (prev_root) kvm_tdp_mmu_put_root(kvm, prev_root); @@ -107,15 +128,17 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * if exiting the loop early, the caller must drop the reference to the most * recent root. (Unless keeping a live reference is desirable.) */ -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ for (_root = tdp_mmu_next_root(_kvm, NULL); \ _root; \ _root = tdp_mmu_next_root(_kvm, _root)) \ if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else -#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ +#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \ + lockdep_is_held_type(&kvm->mmu_lock, 0) || \ + lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \ if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else @@ -171,28 +194,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); refcount_set(&root->tdp_mmu_root_count, 1); - list_add(&root->link, &kvm->arch.tdp_mmu_roots); + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); out: return __pa(root->spt); } -/* - * This is called through call_rcu in order to free TDP page table memory - * safely with respect to other kernel threads that may be operating on - * the memory. - * By only accessing TDP MMU page table memory in an RCU read critical - * section, and freeing it after a grace period, lockless access to that - * memory won't use it after it is freed. 
- */ -static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) -{ - struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, - rcu_head); - - tdp_mmu_free_sp(sp); -} - static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -- Gitee From 18d66167d8d2540d773eab1caeffcc817816e34c Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:32 -0700 Subject: [PATCH 124/158] KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock mainline inclusion from mainline-v5.13-rc1 commit 6103bc074048876794fa6d21fd8989331690ccbd category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6103bc074048876794fa6d21fd8989331690ccbd ---------------------------------------------------------------------- To reduce lock contention and interference with page fault handlers, allow the TDP MMU function to zap a GFN range to operate under the MMU read lock. Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-10-bgardon@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/tdp_mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 22 ++++--- arch/x86/kvm/mmu/tdp_mmu.c | 114 +++++++++++++++++++++++++------------ arch/x86/kvm/mmu/tdp_mmu.h | 14 +++-- 3 files changed, 103 insertions(+), 47 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0f7b088ff99c..7e9191c84a32 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3132,7 +3132,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; if (is_tdp_mmu_page(sp)) - kvm_tdp_mmu_put_root(kvm, sp); + kvm_tdp_mmu_put_root(kvm, sp, false); else if (!--sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -5590,16 +5590,24 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } - if (is_tdp_mmu_enabled(kvm)) { - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, - gfn_end, flush); - } - if (flush) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); write_unlock(&kvm->mmu_lock); + + if (is_tdp_mmu_enabled(kvm)) { + flush = false; + + read_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, + gfn_end, flush, true); + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end); + + read_unlock(&kvm->mmu_lock); + } } static bool slot_rmap_write_protect(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 38f1bedf0627..c4f4ed4accf3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); } +static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, + bool shared) +{ + if (shared) + lockdep_assert_held_read(&kvm->mmu_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); +} + void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { if (!kvm->arch.tdp_mmu_enabled) @@ -42,7 +51,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) } static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush); + gfn_t start, gfn_t end, bool can_yield, bool flush, + bool shared); static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { @@ -66,11 +76,12 @@ static void 
tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) tdp_mmu_free_sp(sp); } -void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - lockdep_assert_held_write(&kvm->mmu_lock); + kvm_lockdep_assert_mmu_lock_held(kvm, shared); if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; @@ -81,7 +92,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) list_del_rcu(&root->link); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); - zap_gfn_range(kvm, root, 0, max_gfn, false, false); + zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -94,12 +105,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) * function will return NULL. */ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, - struct kvm_mmu_page *prev_root) + struct kvm_mmu_page *prev_root, + bool shared) { struct kvm_mmu_page *next_root; - lockdep_assert_held_write(&kvm->mmu_lock); - rcu_read_lock(); if (prev_root) @@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, rcu_read_unlock(); if (prev_root) - kvm_tdp_mmu_put_root(kvm, prev_root); + kvm_tdp_mmu_put_root(kvm, prev_root, shared); return next_root; } @@ -127,12 +137,16 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * This makes it safe to release the MMU lock and yield within the loop, but * if exiting the loop early, the caller must drop the reference to the most * recent root. (Unless keeping a live reference is desirable.) + * + * If shared is set, this function is operating under the MMU lock in read + * mode. In the unlikely event that this thread must free a root, the lock + * will be temporarily dropped and reacquired in write mode. */ -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ - for (_root = tdp_mmu_next_root(_kvm, NULL); \ - _root; \ - _root = tdp_mmu_next_root(_kvm, _root)) \ - if (kvm_mmu_page_as_id(_root) != _as_id) { \ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _shared)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ } else #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ @@ -636,7 +650,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, * Return false if a yield was not needed. */ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, - struct tdp_iter *iter, bool flush) + struct tdp_iter *iter, bool flush, + bool shared) { /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) @@ -648,7 +663,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, if (flush) kvm_flush_remote_tlbs(kvm); - cond_resched_rwlock_write(&kvm->mmu_lock); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_lock(); WARN_ON(iter->gfn > iter->next_last_level_gfn); @@ -666,24 +685,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. + * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. 
If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. Note, in some use cases a flush may be - * required by prior actions. Ensure the pending flush is performed prior to - * yielding. + * operation can cause a soft lockup. + * + * If shared is true, this thread holds the MMU lock in read mode and must + * account for the possibility that other threads are modifying the paging + * structures concurrently. If shared is false, this thread should hold the + * MMU lock in write mode. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush) + gfn_t start, gfn_t end, bool can_yield, bool flush, + bool shared) { struct tdp_iter iter; + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { +retry: if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { flush = false; continue; } @@ -701,8 +728,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); - flush = true; + if (!shared) { + tdp_mmu_set_spte(kvm, &iter, 0); + flush = true; + } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } } rcu_read_unlock(); @@ -714,14 +750,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. + * + * If shared is true, this thread holds the MMU lock in read mode and must + * account for the possibility that other threads are modifying the paging + * structures concurrently. If shared is false, this thread should hold the + * MMU in write mode. 
*/ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush) + gfn_t end, bool can_yield, bool flush, + bool shared) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) - flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, + shared); return flush; } @@ -733,7 +776,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush); + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, + flush, false); if (flush) kvm_flush_remote_tlbs(kvm); @@ -892,7 +936,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, for_each_tdp_mmu_root(kvm, root, range->slot->as_id) flush |= zap_gfn_range(kvm, root, range->start, range->end, - range->may_block, flush); + range->may_block, flush, false); return flush; } @@ -1038,7 +1082,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, for_each_tdp_pte_min_level(iter, root->spt, root->role.level, min_level, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false)) continue; if (!is_shadow_present_pte(iter.old_spte) || @@ -1067,7 +1111,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_mmu_page *root; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); @@ -1091,7 +1135,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); tdp_root_for_each_leaf_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false)) continue; if (!is_shadow_present_pte(iter.old_spte)) @@ -1129,7 +1173,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) struct kvm_mmu_page *root; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); @@ -1214,7 +1258,7 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false)) continue; if (!is_shadow_present_pte(iter.old_spte) || @@ -1241,7 +1285,7 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) struct kvm_mmu_page *root; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false) spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); return spte_set; @@ -1264,7 +1308,7 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { flush = false; continue; } @@ -1299,7 +1343,7 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm 
*kvm, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false) flush = zap_collapsible_spte_range(kvm, root, slot, flush); return flush; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 09d483476015..22072d96ed5d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -13,14 +13,18 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, return refcount_inc_not_zero(&root->tdp_mmu_root_count); } -void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared); bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush); + gfn_t end, bool can_yield, bool flush, + bool shared); static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, - gfn_t start, gfn_t end, bool flush) + gfn_t start, gfn_t end, bool flush, + bool shared) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush, + shared); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -32,7 +36,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) */ lockdep_assert_held_write(&kvm->mmu_lock); return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), - sp->gfn, end, false, false); + sp->gfn, end, false, false, false); } void kvm_tdp_mmu_zap_all(struct kvm *kvm); -- Gitee From 274e0e4c94eceabcc1a7adb31a3aea8921408fd0 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:33 -0700 Subject: [PATCH 125/158] KVM: x86/mmu: Allow zapping collapsible SPTEs to use MMU read lock mainline inclusion from mainline-v5.13-rc1 commit 2db6f772b530eedcf69069e63dd7c4fdf05305fc category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2db6f772b530eedcf69069e63dd7c4fdf05305fc ---------------------------------------------------------------------- To reduce the impact of disabling dirty logging, change the TDP MMU function which zaps collapsible SPTEs to run under the MMU read lock. This way, page faults on zapped SPTEs can proceed in parallel with kvm_mmu_zap_collapsible_sptes. 
Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-11-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 14 ++++++++++---- arch/x86/kvm/mmu/tdp_mmu.c | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7e9191c84a32..35711f2dffa8 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5695,13 +5695,19 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - if (is_tdp_mmu_enabled(kvm)) - flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); - if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); - write_unlock(&kvm->mmu_lock); + + if (is_tdp_mmu_enabled(kvm)) { + flush = false; + + read_lock(&kvm->mmu_lock); + flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + read_unlock(&kvm->mmu_lock); + } } void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c4f4ed4accf3..1079ca848f0a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1308,7 +1308,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { flush = false; continue; } @@ -1323,8 +1324,14 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, pfn, PG_LEVEL_NUM)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); - + if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } flush = true; } @@ -1343,7 +1350,9 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false) + lockdep_assert_held_read(&kvm->mmu_lock); + + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) flush = zap_collapsible_spte_range(kvm, root, slot, flush); return flush; -- Gitee From ac11ca5ab371f5269a8bd4bf768e2c9b6b7026b0 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:34 -0700 Subject: [PATCH 126/158] KVM: x86/mmu: Allow enabling/disabling dirty logging under MMU read lock mainline inclusion from mainline-v5.13-rc1 commit 24ae4cfaaaa22a4f293acd0c7d97804454b7e9fb category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=24ae4cfaaaa22a4f293acd0c7d97804454b7e9fb ---------------------------------------------------------------------- To reduce lock contention and interference with page fault handlers, allow the TDP MMU functions which enable and disable dirty logging to operate under the MMU read lock. 
Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-12-bgardon@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/tdp_mmu.c arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 32 +++++++++++---- arch/x86/kvm/mmu/tdp_mmu.c | 80 ++++++++++++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 35711f2dffa8..f2fafc2fe73f 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5626,10 +5626,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); + read_unlock(&kvm->mmu_lock); + } + /* * We can flush all the TLBs out of the mmu lock without TLB * corruption since we just change the spte from writable to @@ -5732,10 +5736,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); + read_unlock(&kvm->mmu_lock); + } + /* * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB @@ -5755,10 +5763,14 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); + read_unlock(&kvm->mmu_lock); + } + if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } @@ -5771,10 +5783,14 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); + read_unlock(&kvm->mmu_lock); + } + if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1079ca848f0a..6677aba331c7 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -495,8 +495,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, } /* - * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the - * associated bookkeeping + * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically + * and handle the associated bookkeeping, but do not mark the page dirty + * in KVM's dirty bitmaps. 
* * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set @@ -504,9 +505,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Returns: true if the SPTE was set, false if it was not. If false is returned, * this function will have no side-effects. */ -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { lockdep_assert_held_read(&kvm->mmu_lock); @@ -521,9 +522,22 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, new_spte) != iter->old_spte) return false; - handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, true); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); + handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); + + return true; +} + +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) + return false; + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, + iter->old_spte, new_spte, iter->level); return true; } @@ -1082,7 +1096,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, for_each_tdp_pte_min_level(iter, root->spt, root->role.level, min_level, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false)) +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (!is_shadow_present_pte(iter.old_spte) || @@ -1092,7 +1107,15 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, + new_spte)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } spte_set = true; } @@ -1111,7 +1134,9 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_mmu_page *root; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false) + lockdep_assert_held_read(&kvm->mmu_lock); + + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); @@ -1135,7 +1160,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); tdp_root_for_each_leaf_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false)) +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (!is_shadow_present_pte(iter.old_spte)) @@ -1153,7 +1179,15 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; } - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, + new_spte)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. 
+ */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } spte_set = true; } @@ -1173,7 +1207,9 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) struct kvm_mmu_page *root; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false) + lockdep_assert_held_read(&kvm->mmu_lock); + + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); @@ -1257,8 +1293,9 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false)) + tdp_root_for_each_leaf_pte(iter, root, start, end) { +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (!is_shadow_present_pte(iter.old_spte) || @@ -1267,7 +1304,14 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, new_spte = iter.old_spte | shadow_dirty_mask; - tdp_mmu_set_spte(kvm, &iter, new_spte); + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } spte_set = true; } @@ -1285,7 +1329,9 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) struct kvm_mmu_page *root; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false) + lockdep_assert_held_read(&kvm->mmu_lock); + + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); return spte_set; -- Gitee From 699b4d8f8bd08dec392a7515d9111150a6cece13 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:35 -0700 Subject: [PATCH 127/158] KVM: x86/mmu: Fast invalidation for TDP MMU mainline inclusion from mainline-v5.13-rc1 commit b7cccd397f310739fb85383033e95580f99927e0 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b7cccd397f310739fb85383033e95580f99927e0 ---------------------------------------------------------------------- Provide a real mechanism for fast invalidation by marking roots as invalid so that their reference count will quickly fall to zero and they will be torn down. One negative side affect of this approach is that a vCPU thread will likely drop the last reference to a root and be saddled with the work of tearing down an entire paging structure. This issue will be resolved in a later commit. Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-13-bgardon@google.com> [Move the loop to tdp_mmu.c, otherwise compilation fails on 32-bit. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 12 +++++++++--- arch/x86/kvm/mmu/tdp_mmu.c | 17 +++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 5 +++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f2fafc2fe73f..998ace418e72 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5515,6 +5515,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) */ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 
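To make the mechanism easier to follow, here is a minimal single-threaded model of the refcounting rule; struct root, root_get() and root_put() are invented names rather than the KVM helpers. Once a root is marked invalid, new users refuse to take a reference, so the count can only fall and the root is torn down by whichever thread drops the last reference.

  #include <stdbool.h>
  #include <stdio.h>

  struct root {
          int refcount;   /* stands in for tdp_mmu_root_count */
          bool invalid;   /* stands in for role.invalid */
  };

  /* Mirrors the rule added to kvm_tdp_mmu_get_root(): no refs on invalid roots. */
  static bool root_get(struct root *r)
  {
          if (r->invalid || r->refcount == 0)
                  return false;
          r->refcount++;
          return true;
  }

  static void root_put(struct root *r)
  {
          if (--r->refcount == 0)
                  printf("last reference dropped, root torn down\n");
  }

  int main(void)
  {
          struct root r = { .refcount = 1, .invalid = false }; /* one vCPU uses the root */

          r.invalid = true;                       /* the fast zap marks it invalid */
          printf("can a new vCPU reuse the root? %s\n",
                 root_get(&r) ? "yes" : "no");    /* prints "no" */
          root_put(&r);                           /* the existing user eventually drops it */
          return 0;
  }
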
0 : 1; + /* In order to ensure all threads see this change when + * handling the MMU reload signal, this must happen in the + * same critical section as kvm_reload_remote_mmus, and + * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages + * could drop the MMU lock and yield. + */ + if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_invalidate_all_roots(kvm); + /* * Notify all vcpus to reload its shadow page table and flush TLB. * Then all vcpus will switch to new shadow page table with the new @@ -5527,9 +5536,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_zap_obsolete_pages(kvm); - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_zap_all(kvm); - write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6677aba331c7..ac60c30a1663 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -797,6 +797,23 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) kvm_flush_remote_tlbs(kvm); } +/* + * Mark each TDP MMU root as invalid so that other threads + * will drop their references and allow the root count to + * go to 0. + * + * This has essentially the same effect for the TDP MMU + * as updating mmu_valid_gen does for the shadow MMU. + */ +void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) +{ + struct kvm_mmu_page *root; + + lockdep_assert_held_write(&kvm->mmu_lock); + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) + root->role.invalid = true; +} + /* * Installs a last-level SPTE to handle a TDP page fault. * (NPT/EPT violation/misconfiguration) diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 22072d96ed5d..927bc233589f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -10,6 +10,9 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *root) { + if (root->role.invalid) + return false; + return refcount_inc_not_zero(&root->tdp_mmu_root_count); } @@ -38,7 +41,9 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), sp->gfn, end, false, false, false); } + void kvm_tdp_mmu_zap_all(struct kvm *kvm); +void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, -- Gitee From 84a9f471c9b71b9d8e0a652ddd0425dc20d1186e Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:36 -0700 Subject: [PATCH 128/158] KVM: x86/mmu: Tear down roots before kvm_mmu_zap_all_fast returns mainline inclusion from mainline-v5.13-rc1 commit 4c6654bd160dbf4503b360ef8eed80b99eb1b8d9 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4c6654bd160dbf4503b360ef8eed80b99eb1b8d9 ---------------------------------------------------------------------- To avoid saddling a vCPU thread with the work of tearing down an entire paging structure, take a reference on each root before they become obsolete, so that the thread initiating the fast invalidation can tear down the paging structure and (most likely) release the last reference. As a bonus, this teardown can happen under the MMU lock in read mode so as not to block the progress of vCPU threads. 
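The sketch below models the intended hand-off with invented names (struct root, invalidate_all(), zap_invalidated()); it is an illustration, not the kernel code. The invalidating thread pins every root with an extra reference before marking it invalid, then later zaps each root and drops that gifted reference itself, so a vCPU that merely releases its own reference is not left holding the teardown work.

  #include <stdbool.h>
  #include <stdio.h>

  struct root {
          int refcount;
          bool invalid;
  };

  /* The invalidating thread pins every root before marking it invalid. */
  static void invalidate_all(struct root *roots, int n)
  {
          for (int i = 0; i < n; i++) {
                  roots[i].refcount++;            /* reference "gifted" to the zapper */
                  roots[i].invalid = true;
          }
  }

  /*
   * Later, still outside the vCPU path, the same thread zaps each root and
   * drops the gifted reference, so it (and not a vCPU) usually frees the root.
   */
  static void zap_invalidated(struct root *roots, int n)
  {
          for (int i = 0; i < n; i++) {
                  printf("zapping root %d\n", i); /* stands in for zap_gfn_range() */
                  if (--roots[i].refcount == 0)
                          printf("root %d freed by the zapping thread\n", i);
          }
  }

  int main(void)
  {
          struct root roots[2] = { { .refcount = 1 }, { .refcount = 1 } };

          invalidate_all(roots, 2);
          for (int i = 0; i < 2; i++)
                  roots[i].refcount--;            /* vCPUs drop their own references */
          zap_invalidated(roots, 2);
          return 0;
  }
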
Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-14-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 6 +++ arch/x86/kvm/mmu/tdp_mmu.c | 83 +++++++++++++++++++++++++++++++++++++- arch/x86/kvm/mmu/tdp_mmu.h | 1 + 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 998ace418e72..643b79da45ec 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5537,6 +5537,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_zap_obsolete_pages(kvm); write_unlock(&kvm->mmu_lock); + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_zap_invalidated_roots(kvm); + read_unlock(&kvm->mmu_lock); + } } static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index ac60c30a1663..e1ab5a2c687f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -797,11 +797,91 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) kvm_flush_remote_tlbs(kvm); } +static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, + struct kvm_mmu_page *prev_root) +{ + struct kvm_mmu_page *next_root; + + if (prev_root) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &prev_root->link, + typeof(*prev_root), link); + else + next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, + typeof(*next_root), link); + + while (next_root && !(next_root->role.invalid && + refcount_read(&next_root->tdp_mmu_root_count))) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &next_root->link, + typeof(*next_root), link); + + return next_root; +} + +/* + * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each + * invalidated root, they will not be freed until this function drops the + * reference. Before dropping that reference, tear down the paging + * structure so that whichever thread does drop the last reference + * only has to do a trivial amount of work. Since the roots are invalid, + * no new SPTEs should be created under them. + */ +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) +{ + gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + struct kvm_mmu_page *next_root; + struct kvm_mmu_page *root; + bool flush = false; + + lockdep_assert_held_read(&kvm->mmu_lock); + + rcu_read_lock(); + + root = next_invalidated_root(kvm, NULL); + + while (root) { + next_root = next_invalidated_root(kvm, root); + + rcu_read_unlock(); + + flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, + true); + + /* + * Put the reference acquired in + * kvm_tdp_mmu_invalidate_roots + */ + kvm_tdp_mmu_put_root(kvm, root, true); + + root = next_root; + + rcu_read_lock(); + } + + rcu_read_unlock(); + + if (flush) + kvm_flush_remote_tlbs(kvm); +} + /* * Mark each TDP MMU root as invalid so that other threads * will drop their references and allow the root count to * go to 0. * + * Also take a reference on all roots so that this thread + * can do the bulk of the work required to free the roots + * once they are invalidated. Without this reference, a + * vCPU thread might drop the last reference to a root and + * get stuck with tearing down the entire paging structure. + * + * Roots which have a zero refcount should be skipped as + * they're already being torn down. + * Already invalid roots should be referenced again so that + * they aren't freed before kvm_tdp_mmu_zap_all_fast is + * done with them. 
+ * * This has essentially the same effect for the TDP MMU * as updating mmu_valid_gen does for the shadow MMU. */ @@ -811,7 +891,8 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) lockdep_assert_held_write(&kvm->mmu_lock); list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) - root->role.invalid = true; + if (refcount_inc_not_zero(&root->tdp_mmu_root_count)) + root->role.invalid = true; } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 927bc233589f..bf83ce179c17 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -44,6 +44,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, -- Gitee From fe75023f66a0431dcec2a419111afaf98483b367 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 26 Feb 2022 00:15:21 +0000 Subject: [PATCH 129/158] KVM: x86/mmu: Fix wrong/misleading comments in TDP MMU fast zap mainline inclusion from mainline-v5.18-rc1 commit f28e9c7fce14a8c82f9624c9efeccf40b046c522 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f28e9c7fce14a8c82f9624c9efeccf40b046c522 ---------------------------------------------------------------------- Fix misleading and arguably wrong comments in the TDP MMU's fast zap flow. The comments, and the fact that actually zapping invalid roots was added separately, strongly suggests that zapping invalid roots is an optimization and not required for correctness. That is a lie. KVM _must_ zap invalid roots before returning from kvm_mmu_zap_all_fast(), because when it's called from kvm_mmu_invalidate_zap_pages_in_memslot(), KVM is relying on it to fully remove all references to the memslot. Once the memslot is gone, KVM's mmu_notifier hooks will be unable to find the stale references as the hva=>gfn translation is done via the memslots. If KVM doesn't immediately zap SPTEs and userspace unmaps a range after deleting a memslot, KVM will fail to zap in response to the mmu_notifier due to not finding a memslot corresponding to the notifier's range, which leads to a variation of use-after-free. The other misleading comment (and code) explicitly states that roots without a reference should be skipped. While that's technically true, it's also extremely misleading as it should be impossible for KVM to encounter a defunct root on the list while holding mmu_lock for write. Opportunistically add a WARN to enforce that invariant. 
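For reference, the WARN idiom used to enforce that invariant can be modelled in a few lines of plain C; warn_on() and root_get() below are stand-ins, not the kernel helpers. The reference is taken first, and the root is marked invalid only when that succeeds, so a defunct root on the list is reported instead of being silently skipped.

  #include <stdbool.h>
  #include <stdio.h>

  struct root {
          int refcount;
          bool invalid;
  };

  /* Userspace stand-in for WARN_ON_ONCE(): report the condition, then return it. */
  static bool warn_on(bool cond, const char *what)
  {
          if (cond)
                  fprintf(stderr, "WARN: %s\n", what);
          return cond;
  }

  static bool root_get(struct root *r)
  {
          if (r->refcount == 0)
                  return false;   /* defunct root: must not be seen under the write lock */
          r->refcount++;
          return true;
  }

  int main(void)
  {
          struct root live = { .refcount = 1 };
          struct root defunct = { .refcount = 0 };

          /* Mirrors: if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(kvm, root))) */
          if (!warn_on(!root_get(&live), "defunct root on the list"))
                  live.invalid = true;            /* normal case */

          if (!warn_on(!root_get(&defunct), "defunct root on the list"))
                  defunct.invalid = true;         /* skipped; the warning fires instead */

          printf("live: refcount=%d invalid=%d\n", live.refcount, (int)live.invalid);
          return 0;
  }
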
Fixes: b7cccd397f31 ("KVM: x86/mmu: Fast invalidation for TDP MMU") Fixes: 4c6654bd160d ("KVM: x86/mmu: Tear down roots before kvm_mmu_zap_all_fast returns") Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Message-Id: <20220226001546.360188-4-seanjc@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/tdp_mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 8 +++++++ arch/x86/kvm/mmu/tdp_mmu.c | 46 +++++++++++++++++++++----------------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 643b79da45ec..f46752f44909 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5538,6 +5538,14 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) write_unlock(&kvm->mmu_lock); + /* + * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before + * returning to the caller, e.g. if the zap is in response to a memslot + * deletion, mmu_notifier callbacks will be unable to reach the SPTEs + * associated with the deleted memslot once the update completes, and + * Deferring the zap until the final reference to the root is put would + * lead to use-after-free. + */ if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_zap_invalidated_roots(kvm); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e1ab5a2c687f..182cd01d7173 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -820,12 +820,11 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, } /* - * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each - * invalidated root, they will not be freed until this function drops the - * reference. Before dropping that reference, tear down the paging - * structure so that whichever thread does drop the last reference - * only has to do a trivial amount of work. Since the roots are invalid, - * no new SPTEs should be created under them. + * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast + * zap" completes. Since kvm_tdp_mmu_invalidate_all_roots() has acquired a + * reference to each invalidated root, roots will not be freed until after this + * function drops the gifted reference, e.g. so that vCPUs don't get stuck with + * tearing down paging structures. */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { @@ -866,21 +865,25 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) } /* - * Mark each TDP MMU root as invalid so that other threads - * will drop their references and allow the root count to - * go to 0. + * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that + * is about to be zapped, e.g. in response to a memslots update. The caller is + * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual + * zapping. * - * Also take a reference on all roots so that this thread - * can do the bulk of the work required to free the roots - * once they are invalidated. Without this reference, a - * vCPU thread might drop the last reference to a root and - * get stuck with tearing down the entire paging structure. + * Take a reference on all roots to prevent the root from being freed before it + * is zapped by this thread. Freeing a root is not a correctness issue, but if + * a vCPU drops the last reference to a root prior to the root being zapped, it + * will get stuck with tearing down the entire paging structure. * - * Roots which have a zero refcount should be skipped as - * they're already being torn down. 
- * Already invalid roots should be referenced again so that - * they aren't freed before kvm_tdp_mmu_zap_all_fast is - * done with them. + * Get a reference even if the root is already invalid, + * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all + * invalid roots, e.g. there's no epoch to identify roots that were invalidated + * by a previous call. Roots stay on the list until the last reference is + * dropped, so even though all invalid roots are zapped, a root may not go away + * for quite some time, e.g. if a vCPU blocks across multiple memslot updates. + * + * Because mmu_lock is held for write, it should be impossible to observe a + * root with zero refcount, i.e. the list of roots cannot be stale. * * This has essentially the same effect for the TDP MMU * as updating mmu_valid_gen does for the shadow MMU. @@ -890,9 +893,10 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); - list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) - if (refcount_inc_not_zero(&root->tdp_mmu_root_count)) + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { + if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(kvm, root))) root->role.invalid = true; + } } /* -- Gitee From e53d212c4e8a317ccb206a1dc5c9070504566cc2 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 29 Apr 2021 16:12:26 +1200 Subject: [PATCH 130/158] KVM: x86/mmu: Avoid unnecessary page table allocation in kvm_tdp_mmu_map() mainline inclusion from mainline-v5.13-rc2 commit ff76d506030daeeeb967be8b8a189bf7aee8e7a8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ff76d506030daeeeb967be8b8a189bf7aee8e7a8 ---------------------------------------------------------------------- In kvm_tdp_mmu_map(), while iterating TDP MMU page table entries, it is possible SPTE has already been frozen by another thread but the frozen is not done yet, for instance, when another thread is still in middle of zapping large page. In this case, the !is_shadow_present_pte() check for old SPTE in tdp_mmu_for_each_pte() may hit true, and in this case allocating new page table is unnecessary since tdp_mmu_set_spte_atomic() later will return false and page table will need to be freed. Add is_removed_spte() check before allocating new page table to avoid this. Signed-off-by: Kai Huang Message-Id: <20210429041226.50279-1-kai.huang@intel.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 182cd01d7173..21b49508805c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1013,6 +1013,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } if (!is_shadow_present_pte(iter.old_spte)) { + /* + * If SPTE has been forzen by another thread, just + * give up and retry, avoiding unnecessary page table + * allocation and free. 
+ */ + if (is_removed_spte(iter.old_spte)) + break; + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); child_pt = sp->spt; -- Gitee From 5b6b7a33128558cc2db5f4b8a5cd2d4fc65b363c Mon Sep 17 00:00:00 2001 From: "Shahin, Md Shahadat Hossain" Date: Fri, 30 Apr 2021 11:52:31 +0000 Subject: [PATCH 131/158] kvm/x86: Fix 'lpages' kvm stat for TDM MMU mainline inclusion from mainline-v5.13-rc2 commit 1699f65c8b658d434fe92563c906cd1a136c9cb6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1699f65c8b658d434fe92563c906cd1a136c9cb6 ---------------------------------------------------------------------- Large pages not being created properly may result in increased memory access time. The 'lpages' kvm stat used to keep track of the current number of large pages in the system, but with TDP MMU enabled the stat is not showing the correct number. This patch extends the lpages counter to cover the TDP case. Signed-off-by: Md Shahadat Hossain Shahin Cc: Bartosz Szczepanek Message-Id: <1619783551459.35424@amazon.de> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 21b49508805c..a2b7350128e6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -444,6 +444,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + if (is_large_pte(old_spte) != is_large_pte(new_spte)) { + if (is_large_pte(old_spte)) + atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); + else + atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); + } + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ -- Gitee From 601d1dd621c7116b0df73f94c50fa66a2415a8a3 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 May 2021 16:32:27 +0000 Subject: [PATCH 132/158] KVM: x86/mmu: Fix comment mentioning skip_4k mainline inclusion from mainline-v5.13-rc4 commit bedd9195df3dfea7165e7d6f7519a1568bc41936 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bedd9195df3dfea7165e7d6f7519a1568bc41936 ---------------------------------------------------------------------- This comment was left over from a previous version of the patch that introduced wrprot_gfn_range, when skip_4k was passed in instead of min_level. Signed-off-by: David Matlack Message-Id: <20210526163227.3113557-1-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a2b7350128e6..e0d63ce83673 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1196,9 +1196,9 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) } /* - * Remove write access from all the SPTEs mapping GFNs [start, end). If - * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. + * Remove write access from all SPTEs at or above min_level that map GFNs + * [start, end). 
Returns true if an SPTE has been changed and the TLBs need to + * be flushed. */ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, int min_level) -- Gitee From 1641b0ba29729887838c26dc6a35da5adb14c5ee Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 27 May 2021 15:57:51 +0800 Subject: [PATCH 133/158] KVM: x86/mmu: Make is_nx_huge_page_enabled an inline function mainline inclusion from mainline-v5.14-rc1 commit a9d6496d667fdb86713868a402378a0e4db62b50 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a9d6496d667fdb86713868a402378a0e4db62b50 ---------------------------------------------------------------------- Function 'is_nx_huge_page_enabled' is called only by kvm/mmu, so make it as inline fucntion and remove the unnecessary declaration. Cc: Ben Gardon Cc: Paolo Bonzini Cc: Sean Christopherson Suggested-by: Sean Christopherson Signed-off-by: Shaokun Zhang Message-Id: <1622102271-63107-1-git-send-email-zhangshaokun@hisilicon.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 7 +------ arch/x86/kvm/mmu/mmu_internal.h | 9 ++++++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f46752f44909..e3b98537b149 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -56,7 +56,7 @@ extern bool itlb_multihit_kvm_mitigation; -static int __read_mostly nx_huge_pages = -1; +int __read_mostly nx_huge_pages = -1; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; @@ -210,11 +210,6 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index fb8a593e9d35..610ec9ee1052 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -106,7 +106,12 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) return vcpu->arch.mmu == &vcpu->arch.guest_mmu; } -bool is_nx_huge_page_enabled(void); +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); @@ -148,8 +153,6 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, kvm_pfn_t *pfnp, int *goal_levelp); -bool is_nx_huge_page_enabled(void); - void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); -- Gitee From 173fff8968701d25b3e71ca613fe35c65d82fb92 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:08 -0700 Subject: [PATCH 134/158] KVM: x86/mmu: Deduplicate rmap freeing mainline inclusion from mainline-v5.14-rc1 commit c9b929b3fadc0504605d29016eb8274358c7d3ed category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c9b929b3fadc0504605d29016eb8274358c7d3ed 
---------------------------------------------------------------------- Small code deduplication. No functional change expected. Reviewed-by: David Hildenbrand Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-2-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/x86.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bc2d26f67bf2..8a7d55d8e76b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10985,17 +10985,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +static void memslot_rmap_free(struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; + } +} - if (i == 0) - continue; +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + int i; + + memslot_rmap_free(slot); + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -11061,12 +11067,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, return 0; out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); - slot->arch.rmap[i] = NULL; - if (i == 0) - continue; + memslot_rmap_free(slot); + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } -- Gitee From 952e4654271a3c936d94682c7e57b12fe60c9ab2 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:09 -0700 Subject: [PATCH 135/158] KVM: x86/mmu: Factor out allocating memslot rmap mainline inclusion from mainline-v5.14-rc1 commit 56dd1019c88510e79a820965a2da35907fbab00d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=56dd1019c88510e79a820965a2da35907fbab00d ---------------------------------------------------------------------- Small refactor to facilitate allocating rmaps for all memslots at once. No functional change expected. Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-3-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8a7d55d8e76b..666989492914 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11009,10 +11009,31 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) kvm_page_track_free_memslot(slot); } +static int memslot_rmap_alloc(struct kvm_memory_slot *slot, + unsigned long npages) +{ + const int sz = sizeof(*slot->arch.rmap[0]); + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + int level = i + 1; + int lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); + if (!slot->arch.rmap[i]) { + memslot_rmap_free(slot); + return -ENOMEM; + } + } + + return 0; +} + static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, unsigned long npages) { - int i; + int i, r; /* * Clear out the previous array pointers for the KVM_MR_MOVE case. 
The @@ -11021,7 +11042,11 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, */ memset(&slot->arch, 0, sizeof(slot->arch)); - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + r = memslot_rmap_alloc(slot, npages); + if (r) + return r; + + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; @@ -11030,14 +11055,6 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; - slot->arch.rmap[i] = - kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL_ACCOUNT); - if (!slot->arch.rmap[i]) - goto out_free; - if (i == 0) - continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; -- Gitee From 0517483a12d3376ba89c9302c1c50d925a3a5446 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:10 -0700 Subject: [PATCH 136/158] KVM: mmu: Refactor memslot copy mainline inclusion from mainline-v5.14-rc1 commit ddc12f2a12917c10b0deb0928f0560bffb7729ec category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ddc12f2a12917c10b0deb0928f0560bffb7729ec ---------------------------------------------------------------------- Factor out copying kvm_memslots from allocating the memory for new ones in preparation for adding a new lock to protect the arch-specific fields of the memslots. No functional change intended. Reviewed-by: David Hildenbrand Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-4-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- virt/kvm/kvm_main.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6b21b74cf70f..b9ca23acf8eb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1470,6 +1470,18 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, return old_memslots; } +static size_t kvm_memslots_size(int slots) +{ + return sizeof(struct kvm_memslots) + + (sizeof(struct kvm_memory_slot) * slots); +} + +static void kvm_copy_memslots(struct kvm_memslots *to, + struct kvm_memslots *from) +{ + memcpy(to, from, kvm_memslots_size(from->used_slots)); +} + /* * Note, at a minimum, the current number of used slots must be allocated, even * when deleting a memslot, as we need a complete duplicate of the memslots for @@ -1479,19 +1491,16 @@ static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old, enum kvm_mr_change change) { struct kvm_memslots *slots; - size_t old_size, new_size; - - old_size = sizeof(struct kvm_memslots) + - (sizeof(struct kvm_memory_slot) * old->used_slots); + size_t new_size; if (change == KVM_MR_CREATE) - new_size = old_size + sizeof(struct kvm_memory_slot); + new_size = kvm_memslots_size(old->used_slots + 1); else - new_size = old_size; + new_size = kvm_memslots_size(old->used_slots); slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT); if (likely(slots)) - memcpy(slots, old, old_size); + kvm_copy_memslots(slots, old); return slots; } -- Gitee From b481c83315eae37934e5760c98a987f58bc8d864 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:11 -0700 Subject: [PATCH 137/158] KVM: mmu: Add slots_arch_lock for memslot arch fields mainline inclusion from mainline-v5.14-rc1 commit b10a038e84d188e15819058b2978b2daa9853aeb category: feature bugzilla: 
https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b10a038e84d188e15819058b2978b2daa9853aeb ---------------------------------------------------------------------- Add a new lock to protect the arch-specific fields of memslots if they need to be modified in a kvm->srcu read critical section. A future commit will use this lock to lazily allocate memslot rmaps for x86. Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-5-bgardon@google.com> [Add Documentation/ hunk. - Paolo] Signed-off-by: Paolo Bonzini conflictt: virt/kvm/kvm_main.c Signed-off-by: Yu Zhang --- Documentation/virt/kvm/locking.rst | 5 +++ include/linux/kvm_host.h | 9 +++++ virt/kvm/kvm_main.c | 54 ++++++++++++++++++++++++++---- 3 files changed, 62 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 1fc860c007a3..35eca377543d 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -16,6 +16,11 @@ The acquisition orders for mutexes are as follows: - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring them together is quite rare. +- Unlike kvm->slots_lock, kvm->slots_arch_lock is released before + synchronize_srcu(&kvm->srcu). Therefore kvm->slots_arch_lock + can be taken inside a kvm->srcu read-side critical section, + while kvm->slots_lock cannot. + On x86: - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5545ebd8de71..148bffa3a994 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -474,6 +474,15 @@ struct kvm { #endif /* KVM_HAVE_MMU_RWLOCK */ struct mutex slots_lock; + + /* + * Protects the arch-specific fields of struct kvm_memory_slots in + * use by the VM. To be used under the slots_lock (above) or in a + * kvm->srcu critical section where acquiring the slots_lock would + * lead to deadlock with the synchronize_srcu in + * install_new_memslots. + */ + struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b9ca23acf8eb..4c63787c7b08 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1058,6 +1058,7 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); + mutex_init(&kvm->slots_arch_lock); INIT_LIST_HEAD(&kvm->devices); BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); @@ -1444,6 +1445,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; rcu_assign_pointer(kvm->memslots[as_id], slots); + + /* + * Acquired in kvm_set_memslot. Must be released before synchronize + * SRCU below in order to avoid deadlock with another thread + * acquiring the slots_arch_lock in an srcu critical section. + */ + mutex_unlock(&kvm->slots_arch_lock); + synchronize_srcu_expedited(&kvm->srcu); /* @@ -1515,9 +1524,27 @@ static int kvm_set_memslot(struct kvm *kvm, struct kvm_memslots *slots; int r; + /* + * Released in install_new_memslots. + * + * Must be held from before the current memslots are copied until + * after the new memslots are installed with rcu_assign_pointer, + * then released before the synchronize srcu in install_new_memslots. 
+ * + * When modifying memslots outside of the slots_lock, must be held + * before reading the pointer to the current memslots until after all + * changes to those memslots are complete. + * + * These rules ensure that installing new memslots does not lose + * changes made to the previous memslots. + */ + mutex_lock(&kvm->slots_arch_lock); + slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change); - if (!slots) + if (!slots) { + mutex_unlock(&kvm->slots_arch_lock); return -ENOMEM; + } if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { /* @@ -1528,10 +1555,9 @@ static int kvm_set_memslot(struct kvm *kvm, slot->flags |= KVM_MEMSLOT_INVALID; /* - * We can re-use the old memslots, the only difference from the - * newly installed memslots is the invalid flag, which will get - * dropped by update_memslots anyway. We'll also revert to the - * old memslots if preparing the new memory region fails. + * We can re-use the memory from the old memslots. + * It will be overwritten with a copy of the new memslots + * after reacquiring the slots_arch_lock below. */ slots = install_new_memslots(kvm, as_id, slots); @@ -1544,6 +1570,17 @@ static int kvm_set_memslot(struct kvm *kvm, */ kvm_arch_flush_shadow_memslot(kvm, slot); kvm_arch_guest_memory_reclaimed(kvm); + + /* Released in install_new_memslots. */ + mutex_lock(&kvm->slots_arch_lock); + + /* + * The arch-specific fields of the memslots could have changed + * between releasing the slots_arch_lock in + * install_new_memslots and here, so get a fresh copy of the + * slots. + */ + kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id)); } r = kvm_arch_prepare_memory_region(kvm, new, mem, change); @@ -1559,8 +1596,13 @@ static int kvm_set_memslot(struct kvm *kvm, return 0; out_slots: - if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) + if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { + slot = id_to_memslot(slots, old->id); + slot->flags &= ~KVM_MEMSLOT_INVALID; slots = install_new_memslots(kvm, as_id, slots); + } else { + mutex_unlock(&kvm->slots_arch_lock); + } kvfree(slots); return r; } -- Gitee From f1e554b63021c8c81db0743eac5311d5a1b4bd3c Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:12 -0700 Subject: [PATCH 138/158] KVM: x86/mmu: Add a field to control memslot rmap allocation mainline inclusion from mainline-v5.14-rc1 commit a255740876f006eb9041fadcc4750557d26add5f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a255740876f006eb9041fadcc4750557d26add5f ---------------------------------------------------------------------- Add a field to control whether new memslots should have rmaps allocated for them. As of this change, it's not safe to skip allocating rmaps, so the field is always set to allocate rmaps. Future changes will make it safe to operate without rmaps, using the TDP MMU. Then further changes will allow the rmaps to be allocated lazily when needed for nested oprtation. No functional change expected. 
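A toy model of the new control field is sketched below; struct vm, struct slot and slot_metadata_alloc() are invented names used only for illustration. It shows the shape of the check the memslot metadata allocation path gains: rmaps are allocated for a new slot only when the VM-wide field says rmaps are in use.

  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct vm {
          bool memslots_have_rmaps;       /* the new VM-wide control field */
  };

  struct slot {
          unsigned long npages;
          unsigned long *rmap;
  };

  static int slot_metadata_alloc(const struct vm *vm, struct slot *slot)
  {
          if (vm->memslots_have_rmaps) {
                  slot->rmap = calloc(slot->npages, sizeof(*slot->rmap));
                  if (!slot->rmap)
                          return -1;
          }
          return 0;
  }

  int main(void)
  {
          struct vm vm = { .memslots_have_rmaps = true }; /* always true as of this patch */
          struct slot slot = { .npages = 512 };

          if (slot_metadata_alloc(&vm, &slot))
                  return 1;
          printf("rmap %sallocated for the new slot\n", slot.rmap ? "" : "not ");
          free(slot.rmap);
          return 0;
  }
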
Reviewed-by: David Hildenbrand Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-6-bgardon@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/include/asm/kvm_host.h Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/mmu/mmu.c | 2 ++ arch/x86/kvm/x86.c | 13 ++++++++----- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cf20e13d056a..101686e245f1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1087,6 +1087,12 @@ struct kvm_arch { * the global KVM_MAX_VCPU_ID may lead to significant memory waste. */ u32 max_vcpu_ids; + + /* + * If set, rmaps have been allocated for all memslots and should be + * allocated for any newly created or modified memslots. + */ + bool memslots_have_rmaps; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e3b98537b149..16375ddb5dd0 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5566,6 +5566,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) kvm_mmu_init_tdp_mmu(kvm); + kvm->arch.memslots_have_rmaps = true; + node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 666989492914..6ed0f05e624c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11030,7 +11030,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, return 0; } -static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, +static int kvm_alloc_memslot_metadata(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages) { int i, r; @@ -11042,9 +11043,11 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, */ memset(&slot->arch, 0, sizeof(slot->arch)); - r = memslot_rmap_alloc(slot, npages); - if (r) - return r; + if (kvm->arch.memslots_have_rmaps) { + r = memslot_rmap_alloc(slot, npages); + if (r) + return r; + } for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; @@ -11115,7 +11118,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change) { if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) - return kvm_alloc_memslot_metadata(memslot, + return kvm_alloc_memslot_metadata(kvm, memslot, mem->memory_size >> PAGE_SHIFT); return 0; } -- Gitee From 7eb46ab3d33ed664d9d4fe0e3cdf3232a9099e11 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:13 -0700 Subject: [PATCH 139/158] KVM: x86/mmu: Skip rmap operations if rmaps not allocated mainline inclusion from mainline-v5.14-rc1 commit e2209710ccc5d28d8b88c822d2f3e03b269a2856 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e2209710ccc5d28d8b88c822d2f3e03b269a2856 ---------------------------------------------------------------------- If only the TDP MMU is being used to manage the memory mappings for a VM, then many rmap operations can be skipped as they are guaranteed to be no-ops. This saves some time which would be spent on the rmap operation. It also avoids acquiring the MMU lock in write mode for many operations. This makes it safe to run the VM without rmaps allocated, when only using the TDP MMU and sets the stage for waiting to allocate the rmaps until they're needed. 
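The guard pattern can be illustrated with the self-contained sketch below; the helpers are stand-ins for, not copies of, the KVM functions. The legacy rmap walk runs only when rmaps exist, the TDP MMU walk runs either way, and the two "flush needed" results are combined just as the mmu.c hunks that follow do.

  #include <stdbool.h>
  #include <stdio.h>

  struct vm {
          bool have_rmaps;        /* stand-in for kvm->arch.memslots_have_rmaps */
          bool tdp_mmu_enabled;
  };

  static bool memslots_have_rmaps(const struct vm *vm)
  {
          return vm->have_rmaps;
  }

  static bool rmap_clear_dirty(const struct vm *vm)
  {
          (void)vm;
          puts("walking legacy rmaps (mmu_lock held for write in KVM)");
          return true;            /* pretend some SPTEs changed */
  }

  static bool tdp_mmu_clear_dirty(const struct vm *vm)
  {
          (void)vm;
          puts("walking TDP MMU paging structures (mmu_lock held for read in KVM)");
          return true;
  }

  static bool clear_dirty(const struct vm *vm)
  {
          bool flush = false;

          if (memslots_have_rmaps(vm))
                  flush = rmap_clear_dirty(vm);
          if (vm->tdp_mmu_enabled)
                  flush |= tdp_mmu_clear_dirty(vm);
          return flush;           /* caller flushes TLBs only if something changed */
  }

  int main(void)
  {
          struct vm vm = { .have_rmaps = false, .tdp_mmu_enabled = true };

          printf("TLB flush needed: %d\n", clear_dirty(&vm));
          return 0;
  }
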
Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-7-bgardon@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/kvm/mmu/mmu.c Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu.h | 5 ++ arch/x86/kvm/mmu/mmu.c | 133 ++++++++++++++++++++++++----------------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 84 insertions(+), 56 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index d328ce51dddf..319a49233cfc 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -221,4 +221,9 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); +static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +{ + return kvm->arch.memslots_have_rmaps; +} + #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 16375ddb5dd0..55cccf303967 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1210,6 +1210,10 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1239,6 +1243,10 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1278,9 +1286,11 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, int i; bool write_protected = false; - for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); + if (kvm_memslots_have_rmaps(kvm)) { + for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); + } } if (is_tdp_mmu_enabled(kvm)) @@ -1451,9 +1461,10 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush; + bool flush = false; - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1463,9 +1474,10 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush; + bool flush = false; - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1518,9 +1530,10 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool young; + bool young = false; - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1530,9 +1543,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool 
kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool young; + bool young = false; - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -5589,29 +5603,29 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; - write_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; - - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; + + flush = slot_handle_level_range(kvm, memslot, + kvm_zap_rmapp, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, start, + end - 1, true, flush); + } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + write_unlock(&kvm->mmu_lock); } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - - write_unlock(&kvm->mmu_lock); - if (is_tdp_mmu_enabled(kvm)) { flush = false; @@ -5638,12 +5652,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot, int start_level) { - bool flush; + bool flush = false; - write_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, + false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); @@ -5713,16 +5730,15 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; bool flush; - write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - read_lock(&kvm->mmu_lock); flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); if (flush) @@ -5749,11 +5765,14 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush; + bool flush = false; - write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, + false); + write_unlock(&kvm->mmu_lock); + } if 
(is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); @@ -5777,10 +5796,12 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, { bool flush; - write_lock(&kvm->mmu_lock); - flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, - false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, + false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); @@ -5798,9 +5819,11 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, { bool flush; - write_lock(&kvm->mmu_lock); - flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6ed0f05e624c..f861a664da65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11043,7 +11043,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, */ memset(&slot->arch, 0, sizeof(slot->arch)); - if (kvm->arch.memslots_have_rmaps) { + if (kvm_memslots_have_rmaps(kvm)) { r = memslot_rmap_alloc(slot, npages); if (r) return r; -- Gitee From 31cbf13b8b31294c35bc86913bec7cb865bc9c96 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:14 -0700 Subject: [PATCH 140/158] KVM: x86/mmu: Lazily allocate memslot rmaps mainline inclusion from mainline-v5.14-rc1 commit d501f747ef5c0ac0c917f9a6781d04ae4ae39d63 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d501f747ef5c0ac0c917f9a6781d04ae4ae39d63 ---------------------------------------------------------------------- If the TDP MMU is in use, wait to allocate the rmaps until the shadow MMU is actually used. (i.e. a nested VM is launched.) This saves memory equal to 0.2% of guest memory in cases where the TDP MMU is used and there are no nested guests involved. Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-8-bgardon@google.com> Signed-off-by: Paolo Bonzini conflict: arch/x86/include/asm/kvm_host.h Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.h | 7 ++++- arch/x86/kvm/mmu/mmu.c | 14 +++++++--- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++-- arch/x86/kvm/mmu/tdp_mmu.h | 4 +-- arch/x86/kvm/x86.c | 46 +++++++++++++++++++++++++++++++++ 6 files changed, 70 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 101686e245f1..afa82fe4809c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1847,4 +1847,5 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define GET_SMSTATE(type, buf, offset) \ (*(type *)((buf) + (offset) - 0x7e00)) +int alloc_all_memslots_rmaps(struct kvm *kvm); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 319a49233cfc..14d9fdff1fb0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -223,7 +223,12 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm); static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { - return kvm->arch.memslots_have_rmaps; + /* + * Read memslot_have_rmaps before rmap pointers. 
Hence, threads reading + * memslots_have_rmaps in any lock context are guaranteed to see the + * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps. + */ + return smp_load_acquire(&kvm->arch.memslots_have_rmaps); } #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 55cccf303967..e237f4a6e68b 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3298,6 +3298,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } } + r = alloc_all_memslots_rmaps(vcpu->kvm); + if (r) + return r; + write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) @@ -5578,9 +5582,13 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - kvm_mmu_init_tdp_mmu(kvm); - - kvm->arch.memslots_have_rmaps = true; + if (!kvm_mmu_init_tdp_mmu(kvm)) + /* + * No smp_load/store wrappers needed here as we are in + * VM init and there cannot be any memslots / other threads + * accessing this struct kvm yet. + */ + kvm->arch.memslots_have_rmaps = true; node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e0d63ce83673..aa56f0b4c0f3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -14,10 +14,10 @@ static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ -void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return; + return false; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; @@ -25,6 +25,8 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); + + return true; } static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index bf83ce179c17..3b66e7807226 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -76,12 +76,12 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); #ifdef CONFIG_X86_64 -void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } #else -static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} +static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f861a664da65..e3ff20bf1ec0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11020,6 +11020,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, int lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; + WARN_ON(slot->arch.rmap[i]); + slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) { memslot_rmap_free(slot); @@ -11030,6 +11032,50 @@ static int 
memslot_rmap_alloc(struct kvm_memory_slot *slot, return 0; } +int alloc_all_memslots_rmaps(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r, i; + + /* + * Check if memslots alreday have rmaps early before acquiring + * the slots_arch_lock below. + */ + if (kvm_memslots_have_rmaps(kvm)) + return 0; + + mutex_lock(&kvm->slots_arch_lock); + + /* + * Read memslots_have_rmaps again, under the slots arch lock, + * before allocating the rmaps + */ + if (kvm_memslots_have_rmaps(kvm)) { + mutex_unlock(&kvm->slots_arch_lock); + return 0; + } + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + r = memslot_rmap_alloc(slot, slot->npages); + if (r) { + mutex_unlock(&kvm->slots_arch_lock); + return r; + } + } + } + + /* + * Ensure that memslots_have_rmaps becomes true strictly after + * all the rmap pointers are set. + */ + smp_store_release(&kvm->arch.memslots_have_rmaps, true); + mutex_unlock(&kvm->slots_arch_lock); + return 0; +} + static int kvm_alloc_memslot_metadata(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) -- Gitee From d3ac83c51d3e0d00784e51c73d212f5464c4cb0c Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jun 2021 12:57:10 +1200 Subject: [PATCH 141/158] KVM: x86/mmu: Fix pf_fixed count in tdp_mmu_map_handle_target_level() mainline inclusion from mainline-v5.14-rc1 commit 857f84743e4b78500afae010d866675642e18e90 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=857f84743e4b78500afae010d866675642e18e90 ---------------------------------------------------------------------- Currently pf_fixed is not increased when prefault is true. This is not correct, since prefault here really means "async page fault completed". In that case, the original page fault from the guest was morphed into as async page fault and pf_fixed was not increased. So when prefault indicates async page fault is completed, pf_fixed should be increased. Additionally, currently pf_fixed is also increased even when page fault is spurious, while legacy MMU increases pf_fixed when page fault returns RET_PF_EMULATE or RET_PF_FIXED. To fix above two issues, change to increase pf_fixed when return value is not RET_PF_SPURIOUS (RET_PF_RETRY has already been ruled out by reaching here). More information: https://lore.kernel.org/kvm/cover.1620200410.git.kai.huang@intel.com/T/#mbb5f8083e58a2cd262231512b9211cbe70fc3bd5 Fixes: bb18842e2111 ("kvm: x86/mmu: Add TDP MMU PF handler") Reviewed-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <2ea8b7f5d4f03c99b32bc56fc982e1e4e3d3fc6b.1623717884.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index aa56f0b4c0f3..09ff289d3747 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -955,7 +955,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, rcu_dereference(iter->sptep)); } - if (!prefault) + /* + * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be + * consistent with legacy MMU behavior. 
+ */ + if (ret != RET_PF_SPURIOUS) vcpu->stat.pf_fixed++; return ret; -- Gitee From 351e647c4b2f84fd2c0556237992288b42c47c94 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jun 2021 12:57:11 +1200 Subject: [PATCH 142/158] KVM: x86/mmu: Fix TDP MMU page table level mainline inclusion from mainline-v5.14-rc1 commit f1b8325508327a302f1d5cd8a4bf51e2c9c72fa9 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f1b8325508327a302f1d5cd8a4bf51e2c9c72fa9 ---------------------------------------------------------------------- TDP MMU iterator's level is identical to page table's actual level. For instance, for the last level page table (whose entry points to one 4K page), iter->level is 1 (PG_LEVEL_4K), and in case of 5 level paging, the iter->level is mmu->shadow_root_level, which is 5. However, struct kvm_mmu_page's level currently is not set correctly when it is allocated in kvm_tdp_mmu_map(). When iterator hits non-present SPTE and needs to allocate a new child page table, currently iter->level, which is the level of the page table where the non-present SPTE belongs to, is used. This results in struct kvm_mmu_page's level always having its parent's level (excpet root table's level, which is initialized explicitly using mmu->shadow_root_level). This is kinda wrong, and not consistent with existing non TDP MMU code. Fortuantely sp->role.level is only used in handle_removed_tdp_mmu_page() and kvm_tdp_mmu_zap_sp(), and they are already aware of this and behave correctly. However to make it consistent with legacy MMU code (and fix the issue that both root page table and its child page table have shadow_root_level), use iter->level - 1 in kvm_tdp_mmu_map(), and change handle_removed_tdp_mmu_page() and kvm_tdp_mmu_zap_sp() accordingly. 
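For reference, a compact sketch of the level arithmetic this change restores, assuming x86-64 with 4KiB base pages. pages_per_hpage() below is only an illustrative stand-in for KVM's KVM_PAGES_PER_HPAGE() macro and is not part of the patch:

/*
 * Number of 4KiB pages covered by one mapping at 'level':
 * level 1 (PG_LEVEL_4K) -> 1, level 2 (2MiB) -> 512, level 3 (1GiB) -> 262144.
 */
static inline unsigned long pages_per_hpage(int level)
{
        return 1UL << ((level - 1) * 9);
}

With sp->role.level set to iter->level - 1, a child table's level matches the
level of the SPTEs it contains: entry i of the table starts at
sp->gfn + i * pages_per_hpage(sp->role.level), and the table as a whole spans
pages_per_hpage(sp->role.level + 1) pages. That is why the per-entry stride in
handle_removed_tdp_mmu_page() moves from KVM_PAGES_PER_HPAGE(level - 1) to
KVM_PAGES_PER_HPAGE(level), while the spans flushed or zapped for a whole table
move from 'level' to 'level + 1'.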
Reviewed-by: Ben Gardon Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 8 ++++---- arch/x86/kvm/mmu/tdp_mmu.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 09ff289d3747..cb425c422089 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -337,7 +337,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, for (i = 0; i < PT64_ENT_PER_PAGE; i++) { sptep = rcu_dereference(pt) + i; - gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); + gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); if (shared) { /* @@ -379,12 +379,12 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, WRITE_ONCE(*sptep, REMOVED_SPTE); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_child_spte, REMOVED_SPTE, level - 1, + old_child_spte, REMOVED_SPTE, level, shared); } kvm_flush_remote_tlbs_with_address(kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -1034,7 +1034,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (is_removed_spte(iter.old_spte)) break; - sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); child_pt = sp->spt; new_spte = make_nonleaf_spte(child_pt, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3b66e7807226..90221caf1848 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -31,7 +31,7 @@ static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1); /* * Don't allow yielding, as the caller may have pending pages to zap -- Gitee From b4170b93f1dba9b243485937bb9137c20d0a727e Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 15 Nov 2021 13:17:04 -0800 Subject: [PATCH 143/158] KVM: x86/mmu: Fix TLB flush range when handling disconnected pt mainline inclusion from mainline-v5.16-rc4 commit 574c3c55e969096cea770eda3375ff35ccf91702 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=574c3c55e969096cea770eda3375ff35ccf91702 ---------------------------------------------------------------------- When recursively clearing out disconnected pts, the range based TLB flush in handle_removed_tdp_mmu_page uses the wrong starting GFN, resulting in the flush mostly missing the affected range. Fix this by using base_gfn for the flush. In response to feedback from David Matlack on the RFC version of this patch, also move a few definitions into the for loop in the function to prevent unintended references to them in the future. 
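To make the "mostly missing" concrete, consider tearing down a last-level table
(level == PG_LEVEL_4K) that sat under a 2MiB region; the numbers below are a
worked example, and only the final call is taken from the patch:

/*
 * The removed table mapped gfns [base_gfn, base_gfn + 511].
 *
 * Old flush: after the loop, 'gfn' holds the gfn of the last entry, so the
 * flush covered [base_gfn + 511, base_gfn + 1022] -- a single page of the
 * affected range plus 511 pages belonging to the neighbouring region.
 *
 * New flush: [base_gfn, base_gfn + 511], exactly the pages the removed
 * table used to map.
 */
kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
                                   KVM_PAGES_PER_HPAGE(level + 1));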
Fixes: a066e61f13cf ("KVM: x86/mmu: Factor out handling of removed page tables") CC: stable@vger.kernel.org Signed-off-by: Ben Gardon Message-Id: <20211115211704.2621644-1-bgardon@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index cb425c422089..9201cd5268a2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -326,9 +326,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; - u64 old_child_spte; - u64 *sptep; - gfn_t gfn; int i; trace_kvm_mmu_prepare_zap_page(sp); @@ -336,8 +333,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = rcu_dereference(pt) + i; - gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 *sptep = rcu_dereference(pt) + i; + gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 old_child_spte; if (shared) { /* @@ -383,7 +381,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, shared); } - kvm_flush_remote_tlbs_with_address(kvm, gfn, + kvm_flush_remote_tlbs_with_address(kvm, base_gfn, KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); -- Gitee From ee4b4a55c37f97c33db86a32b435c9d0423a89e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Jun 2021 09:29:05 -0700 Subject: [PATCH 144/158] KVM: x86/mmu: Grab nx_lpage_splits as an unsigned long before division mainline inclusion from mainline-v5.14-rc1 commit ade74e1433f32e3fb422e3700d5bab34c57f4f47 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ade74e1433f32e3fb422e3700d5bab34c57f4f47 ---------------------------------------------------------------------- Snapshot kvm->stats.nx_lpage_splits into a local unsigned long to avoid 64-bit division on 32-bit kernels. Casting to an unsigned long is safe because the maximum number of shadow pages, n_max_mmu_pages, is also an unsigned long, i.e. KVM will start recycling shadow pages before the number of splits can exceed a 32-bit value. ERROR: modpost: "__udivdi3" [arch/x86/kvm/kvm.ko] undefined! Fixes: 7ee093d4f3f5 ("KVM: switch per-VM stats to u64") Signed-off-by: Sean Christopherson Message-Id: <20210615162905.2132937-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e237f4a6e68b..c718891cd1c0 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6141,6 +6141,7 @@ static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel static void kvm_recover_nx_lpages(struct kvm *kvm) { + unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -6152,7 +6153,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) write_lock(&kvm->mmu_lock); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + to_zap = ratio ? 
DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) break; -- Gitee From cb27deacf6d81d56eeb431da3c9e44a1be6680c2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Jun 2021 06:42:10 -0400 Subject: [PATCH 145/158] KVM: x86: Stub out is_tdp_mmu_root on 32-bit hosts mainline inclusion from mainline-v5.14-rc1 commit c62efff28bb5eb60d60415a0dd0c864c64be0671 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c62efff28bb5eb60d60415a0dd0c864c64be0671 ---------------------------------------------------------------------- If is_tdp_mmu_root is not inlined, the elimination of TDP MMU calls as dead code might not work out. To avoid this, explicitly declare the stubbed is_tdp_mmu_root on 32-bit hosts. Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 90221caf1848..ab642a59f3c3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -80,12 +80,6 @@ bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } -#else -static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } -static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -#endif static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) { @@ -102,5 +96,12 @@ static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) return is_tdp_mmu_page(sp) && sp->root_count; } +#else +static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } +static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } +static inline bool is_tdp_mmu_root(hpa_t hpa) { return false; } +#endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ -- Gitee From 48fbf9765d61a19e26f849bc14f5e04395bc59e9 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 17 Jun 2021 23:19:45 +0000 Subject: [PATCH 146/158] KVM: x86/mmu: Remove redundant is_tdp_mmu_root check mainline inclusion from mainline-v5.14-rc1 commit aa23c0ad14228ccfcd0b6f799dd34b348a5f2b1e category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aa23c0ad14228ccfcd0b6f799dd34b348a5f2b1e ---------------------------------------------------------------------- The check for is_tdp_mmu_root in kvm_tdp_mmu_map is redundant because kvm_tdp_mmu_map's only caller (direct_page_fault) already checks is_tdp_mmu_root. 
Suggested-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210617231948.2591431-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 9201cd5268a2..6af4b9ef09b0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -987,8 +987,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); -- Gitee From 13e6eacccf9c90bf94c7f860c1db58125ca75652 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 17 Jun 2021 23:19:46 +0000 Subject: [PATCH 147/158] KVM: x86/mmu: Remove redundant is_tdp_mmu_enabled check mainline inclusion from mainline-v5.14-rc1 commit 0b873fd7fb53ed7343ee7ee166e1373aec02a9cb category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0b873fd7fb53ed7343ee7ee166e1373aec02a9cb ---------------------------------------------------------------------- This check is redundant because the root shadow page will only be a TDP MMU page if is_tdp_mmu_enabled() returns true, and is_tdp_mmu_enabled() never changes for the lifetime of a VM. It's possible that this check was added for performance reasons but it is unlikely that it is useful in practice since to_shadow_page() is cheap. That being said, this patch also caches the return value of is_tdp_mmu_root() in direct_page_fault() since there's no reason to duplicate the call so many times, so performance is not a concern. 
Suggested-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210617231948.2591431-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 11 ++++++----- arch/x86/kvm/mmu/tdp_mmu.h | 4 +--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c718891cd1c0..3bb7a05b5292 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3581,7 +3581,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) return reserved; } - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_root(vcpu->arch.mmu->root_hpa)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); @@ -3759,6 +3759,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { + bool is_tdp_mmu_fault = is_tdp_mmu_root(vcpu->arch.mmu->root_hpa); bool write = error_code & PFERR_WRITE_MASK; bool map_writable; @@ -3771,7 +3772,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + if (!is_tdp_mmu_fault) { r = fast_page_fault(vcpu, gpa, error_code); if (r != RET_PF_INVALID) return r; @@ -3793,7 +3794,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = RET_PF_RETRY; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) read_lock(&vcpu->kvm->mmu_lock); else write_lock(&vcpu->kvm->mmu_lock); @@ -3804,7 +3805,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (r) goto out_unlock; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, pfn, prefault); else @@ -3812,7 +3813,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index ab642a59f3c3..7021f06b86cf 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -81,12 +81,10 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } -static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +static inline bool is_tdp_mmu_root(hpa_t hpa) { struct kvm_mmu_page *sp; - if (!is_tdp_mmu_enabled(kvm)) - return false; if (WARN_ON(!VALID_PAGE(hpa))) return false; -- Gitee From 9a02c9eb947f0e0a7b6bf6d7e5f5876e9d134bd3 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 17 Jun 2021 23:19:47 +0000 Subject: [PATCH 148/158] KVM: x86/mmu: Refactor is_tdp_mmu_root into is_tdp_mmu mainline inclusion from mainline-v5.14-rc1 commit 63c0cac938edfa5d72bfbe8f1eeb9d47b397829c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=63c0cac938edfa5d72bfbe8f1eeb9d47b397829c 
---------------------------------------------------------------------- This change simplifies the call sites slightly and also abstracts away the implementation detail of looking at root_hpa as the mechanism for determining if the mmu is the TDP MMU. Suggested-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210617231948.2591431-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/tdp_mmu.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3bb7a05b5292..ab23c284a69d 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3581,7 +3581,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) return reserved; } - if (is_tdp_mmu_root(vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); @@ -3759,7 +3759,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { - bool is_tdp_mmu_fault = is_tdp_mmu_root(vcpu->arch.mmu->root_hpa); + bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); bool write = error_code & PFERR_WRITE_MASK; bool map_writable; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 7021f06b86cf..6608fab14729 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -81,9 +81,10 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } -static inline bool is_tdp_mmu_root(hpa_t hpa) +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { struct kvm_mmu_page *sp; + hpa_t hpa = mmu->root_hpa; if (WARN_ON(!VALID_PAGE(hpa))) return false; @@ -99,7 +100,7 @@ static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -static inline bool is_tdp_mmu_root(hpa_t hpa) { return false; } +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ -- Gitee From 1ffb77554ced1d745a411d7188d98713ffdcec78 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 17 Jun 2021 23:19:48 +0000 Subject: [PATCH 149/158] KVM: x86/mmu: Remove redundant root_hpa checks mainline inclusion from mainline-v5.14-rc1 commit 0485cf8dbe964b6cc485178da6ee8ae7b2d0d15c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0485cf8dbe964b6cc485178da6ee8ae7b2d0d15c ---------------------------------------------------------------------- The root_hpa checks below the top-level check in kvm_mmu_page_fault are theoretically redundant since there is no longer a way for the root_hpa to be reset during a page fault. 
The details of why are described in commit ddce6208217c ("KVM: x86/mmu: Move root_hpa validity checks to top of page fault handler") __direct_map, kvm_tdp_mmu_map, and get_mmio_spte are all only reachable through kvm_mmu_page_fault, therefore their root_hpa checks are redundant. Suggested-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210617231948.2591431-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 8 -------- arch/x86/kvm/mmu/tdp_mmu.c | 3 --- 2 files changed, 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ab23c284a69d..0c0595ed1814 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2845,9 +2845,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); @@ -3576,11 +3573,6 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { - *sptep = 0ull; - return reserved; - } - if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6af4b9ef09b0..b20a9690b019 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -985,9 +985,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int level; int req_level; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); -- Gitee From b2e288d757395b7f94d22b95d2b92eb76382fb2f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 22 Jun 2021 16:09:12 +0100 Subject: [PATCH 150/158] KVM: x86/mmu: Fix uninitialized boolean variable flush mainline inclusion from mainline-v5.14-rc1 commit 31c656570065727028f96c811b5ea9fc61502a18 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=31c656570065727028f96c811b5ea9fc61502a18 ---------------------------------------------------------------------- In the case where kvm_memslots_have_rmaps(kvm) is false the boolean variable flush is not set and is uninitialized. If is_tdp_mmu_enabled(kvm) is true then the call to kvm_tdp_mmu_zap_collapsible_sptes passes the uninitialized value of flush into the call. Fix this by initializing flush to false. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: e2209710ccc5 ("KVM: x86/mmu: Skip rmap operations if rmaps not allocated") Signed-off-by: Colin Ian King Reviewed-by: Sean Christopherson Message-Id: <20210622150912.23429-1-colin.king@canonical.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0c0595ed1814..47ced53436af 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5729,7 +5729,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, { /* FIXME: const-ify all uses of struct kvm_memory_slot. 
*/ struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; - bool flush; + bool flush = false; if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); -- Gitee From 6cb1f60b59d7982d63ee8eb4ddfb2004ea062d1d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 00:24:54 -0700 Subject: [PATCH 151/158] KVM: x86/mmu: Don't WARN on a NULL shadow page in TDP MMU check mainline inclusion from mainline-v5.14-rc1 commit 6c6e166b2c8513721d166c74060d26d3f4aecb48 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6c6e166b2c8513721d166c74060d26d3f4aecb48 ---------------------------------------------------------------------- Treat a NULL shadow page in the "is a TDP MMU" check as valid, non-TDP root. KVM uses a "direct" PAE paging MMU when TDP is disabled and the guest is running with paging disabled. In that case, root_hpa points at the pae_root page (of which only 32 bytes are used), not a standard shadow page, and the WARN fires (a lot). Fixes: 0b873fd7fb53 ("KVM: x86/mmu: Remove redundant is_tdp_mmu_enabled check") Cc: David Matlack Signed-off-by: Sean Christopherson Message-Id: <20210622072454.3449146-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/tdp_mmu.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 6608fab14729..83dbc426a937 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -89,11 +89,13 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) if (WARN_ON(!VALID_PAGE(hpa))) return false; + /* + * A NULL shadow page is legal when shadowing a non-paging guest with + * PAE paging, as the MMU will be direct with root_hpa pointing at the + * pae_root page, not a shadow page. + */ sp = to_shadow_page(hpa); - if (WARN_ON(!sp)) - return false; - - return is_tdp_mmu_page(sp) && sp->root_count; + return sp && is_tdp_mmu_page(sp) && sp->root_count; } #else static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } -- Gitee From c7324f648d9b7bc698cf46b5ca06c48e56d7cc79 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:46 -0700 Subject: [PATCH 152/158] KVM: x86/mmu: Remove broken WARN that fires on 32-bit KVM w/ nested EPT mainline inclusion from mainline-v5.14-rc1 commit f0d4379087d8a83f478b371ff7786e8df0cc2314 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f0d4379087d8a83f478b371ff7786e8df0cc2314 ---------------------------------------------------------------------- Remove a misguided WARN that attempts to detect the scenario where using a special A/D tracking flag will set reserved bits on a non-MMIO spte. The WARN triggers false positives when using EPT with 32-bit KVM because of the !64-bit clause, which is just flat out wrong. The whole A/D tracking goo is specific to EPT, and one of the big selling points of EPT is that EPT is decoupled from the host's native paging mode. Drop the WARN instead of trying to salvage the check. Keeping a check specific to A/D tracking bits would essentially regurgitate the same code that led to KVM needed the tracking bits in the first place. 
A better approach would be to add a generic WARN on reserved bits being set, which would naturally cover the A/D tracking bits, work for all flavors of paging, and be self-documenting to some extent. Fixes: 8a406c89532c ("KVM: x86/mmu: Rename and document A/D scheme for TDP SPTEs") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/spte.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 51e58543bb83..9e0eaf9da41f 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -99,13 +99,6 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, else if (kvm_vcpu_ad_need_write_protect(vcpu)) spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; - /* - * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set - * if PAE paging may be employed (shadow paging or any 32-bit KVM). - */ - WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) && - (spte & SPTE_TDP_AD_MASK)); - /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, -- Gitee From 9d1da280950c36d423d0f223248f182f9bb9ac5c Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 13 Jul 2021 22:09:52 +0000 Subject: [PATCH 153/158] KVM: x86/mmu: Rename cr2_or_gpa to gpa in fast_page_fault mainline inclusion from mainline-v5.15-rc1 commit 76cd325ea75bb2a84c329782d7b8015b6a970c34 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76cd325ea75bb2a84c329782d7b8015b6a970c34 ---------------------------------------------------------------------- fast_page_fault is only called from direct_page_fault where we know the address is a gpa. Fixes: 736c291c9f36 ("KVM: x86: Use gpa_t for cr2/gpa to fix TDP support on 32-bit KVM") Reviewed-by: Ben Gardon Reviewed-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210713220957.3493520-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 47ced53436af..8322c01c9990 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3022,8 +3022,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. 
*/ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -3039,7 +3038,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) + for_each_shadow_entry_lockless(vcpu, gpa, iterator, spte) if (!is_shadow_present_pte(spte)) break; @@ -3118,8 +3117,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); - trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, iterator.sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; -- Gitee From 9b2f21030b79b7551fbf80012e4d0bd51f091b47 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 13 Jul 2021 22:09:53 +0000 Subject: [PATCH 154/158] KVM: x86/mmu: Fix use of enums in trace_fast_page_fault mainline inclusion from mainline-v5.15-rc1 commit 61bcd360aa9851cec969b7b40f23fd1faa7f85a4 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=61bcd360aa9851cec969b7b40f23fd1faa7f85a4 ---------------------------------------------------------------------- Enum values have to be exported to userspace since the formatting is not done in the kernel. Without doing this perf maps RET_PF_FIXED and RET_PF_SPURIOUS to 0, which results in incorrect output: $ perf record -a -e kvmmmu:fast_page_fault --filter "ret==3" -- ./access_tracking_perf_test $ perf script | head -1 [...] new 610006048d25877 spurious 0 fixed 0 <------ should be 1 Fix this by exporting the enum values to userspace with TRACE_DEFINE_ENUM. Fixes: c4371c2a682e ("KVM: x86/mmu: Return unique RET_PF_* values if the fault was fixed") Signed-off-by: David Matlack Message-Id: <20210713220957.3493520-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu_internal.h | 3 +++ arch/x86/kvm/mmu/mmutrace.h | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 610ec9ee1052..090a4e24a427 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -130,6 +130,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. 
+ * + * Any names added to this enum should be exported to userspace for use in + * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h */ enum { RET_PF_RETRY = 0, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index e798489b56b5..bbae3488f8e0 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,12 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_RETRY); +TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_FIXED); +TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); + /* * A pagetable walk has started */ -- Gitee From ad2f6263a2d65464ece60c220b6ffd7270ab8112 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 13 Jul 2021 22:09:54 +0000 Subject: [PATCH 155/158] KVM: x86/mmu: Make walk_shadow_page_lockless_{begin,end} interoperate with the TDP MMU mainline inclusion from mainline-v5.15-rc1 commit c5c8c7c53004cb70715320018c3b4287071c1cfd category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c5c8c7c53004cb70715320018c3b4287071c1cfd ---------------------------------------------------------------------- Acquire the RCU read lock in walk_shadow_page_lockless_begin and release it in walk_shadow_page_lockless_end when the TDP MMU is enabled. This should not introduce any functional changes but is used in the following commit to make fast_page_fault interoperate with the TDP MMU. Signed-off-by: David Matlack Message-Id: <20210713220957.3493520-4-dmatlack@google.com> [Use if...else instead of if(){return;}] Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 52 +++++++++++++++++++++++--------------- arch/x86/kvm/mmu/tdp_mmu.c | 6 ++--- arch/x86/kvm/mmu/tdp_mmu.h | 10 ++++++++ 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8322c01c9990..25c8bee17ac6 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -614,28 +614,36 @@ static bool mmu_spte_age(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - /* - * Prevent page table teardown by making any free-er wait during - * kvm_flush_remote_tlbs() IPI to all active vcpus. - */ - local_irq_disable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_begin(); + } else { + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); - /* - * Make sure a following spte read is not reordered ahead of the write - * to vcpu->mode. - */ - smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* - * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us - * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. - */ - smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); - local_irq_enable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_end(); + } else { + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. 
If it does, kvm_mmu_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) @@ -3536,6 +3544,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { @@ -3543,8 +3553,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); @@ -3558,8 +3566,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level break; } - walk_shadow_page_lockless_end(vcpu); - return leaf; } @@ -3571,11 +3577,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; + walk_shadow_page_lockless_begin(vcpu); + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); + walk_shadow_page_lockless_end(vcpu); + if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index b20a9690b019..23d8b04b8cf1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1571,6 +1571,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
*/ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) @@ -1582,14 +1584,10 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; - rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } - rcu_read_unlock(); - return leaf; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 83dbc426a937..efe0b685f187 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -72,6 +72,16 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); +static inline void kvm_tdp_mmu_walk_lockless_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_tdp_mmu_walk_lockless_end(void) +{ + rcu_read_unlock(); +} + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); -- Gitee From a75608973f9c17a5470a37664e53911a8df8e0ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Aug 2021 11:18:15 -0700 Subject: [PATCH 156/158] KVM: x86/mmu: Protect marking SPs unsync when using TDP MMU with spinlock mainline inclusion from mainline-v5.14-rc6 commit ce25681d59ffc4303321e555a2d71b1946af07da category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ce25681d59ffc4303321e555a2d71b1946af07da ---------------------------------------------------------------------- Add yet another spinlock for the TDP MMU and take it when marking indirect shadow pages unsync. When using the TDP MMU and L1 is running L2(s) with nested TDP, KVM may encounter shadow pages for the TDP entries managed by L1 (controlling L2) when handling a TDP MMU page fault. The unsync logic is not thread safe, e.g. the kvm_mmu_page fields are not atomic, and misbehaves when a shadow page is marked unsync via a TDP MMU page fault, which runs with mmu_lock held for read, not write. Lack of a critical section manifests most visibly as an underflow of unsync_children in clear_unsync_child_bit() due to unsync_children being corrupted when multiple CPUs write it without a critical section and without atomic operations. But underflow is the best case scenario. The worst case scenario is that unsync_children prematurely hits '0' and leads to guest memory corruption due to KVM neglecting to properly sync shadow pages. Use an entirely new spinlock even though piggybacking tdp_mmu_pages_lock would functionally be ok. Usurping the lock could degrade performance when building upper level page tables on different vCPUs, especially since the unsync flow could hold the lock for a comparatively long time depending on the number of indirect shadow pages and the depth of the paging tree. For simplicity, take the lock for all MMUs, even though KVM could fairly easily know that mmu_lock is held for write. If mmu_lock is held for write, there cannot be contention for the inner spinlock, and marking shadow pages unsync across multiple vCPUs will be slow enough that bouncing the kvm_arch cacheline should be in the noise. Note, even though L2 could theoretically be given access to its own EPT entries, a nested MMU must hold mmu_lock for write and thus cannot race against a TDP MMU page fault. I.e. 
the additional spinlock only _needs_ to be taken by the TDP MMU, as opposed to being taken by any MMU for a VM that is running with the TDP MMU enabled. Holding mmu_lock for read also prevents the indirect shadow page from being freed. But as above, keep it simple and always take the lock. Alternative #1, the TDP MMU could simply pass "false" for can_unsync and effectively disable unsync behavior for nested TDP. Write protecting leaf shadow pages is unlikely to noticeably impact traditional L1 VMMs, as such VMMs typically don't modify TDP entries, but the same may not hold true for non-standard use cases and/or VMMs that are migrating physical pages (from L1's perspective). Alternative #2, the unsync logic could be made thread safe. In theory, simply converting all relevant kvm_mmu_page fields to atomics and using atomic bitops for the bitmap would suffice. However, (a) an in-depth audit would be required, (b) the code churn would be substantial, and (c) legacy shadow paging would incur additional atomic operations in performance sensitive paths for no benefit (to legacy shadow paging). Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU") Cc: stable@vger.kernel.org Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210812181815.3378104-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- Documentation/virt/kvm/locking.rst | 8 ++++---- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 35eca377543d..88fa495abbac 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -25,10 +25,10 @@ On x86: - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock -- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is - taken inside kvm->arch.mmu_lock, and cannot be taken without already - holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise - there's no need to take kvm->arch.tdp_mmu_pages_lock at all). +- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and + kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and + cannot be taken without already holding kvm->arch.mmu_lock (typically with + ``read_lock`` for the TDP MMU, thus the need for additional spinlocks). Everything else is a leaf: no other lock is taken inside the critical sections. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index afa82fe4809c..678955cd2175 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -938,6 +938,13 @@ struct kvm_arch { struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; + /* + * Protects marking pages unsync during page faults, as TDP MMU page + * faults only take mmu_lock for read. For simplicity, the unsync + * pages lock is always taken when marking pages unsync regardless of + * whether mmu_lock is held for read or write. 
+ */ + spinlock_t mmu_unsync_pages_lock; struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 25c8bee17ac6..3d2a35a36564 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2478,6 +2478,7 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + bool locked = false; if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) return true; @@ -2489,9 +2490,34 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, if (sp->unsync) continue; + /* + * TDP MMU page faults require an additional spinlock as they + * run with mmu_lock held for read, not write, and the unsync + * logic is not thread safe. Take the spinklock regardless of + * the MMU type to avoid extra conditionals/parameters, there's + * no meaningful penalty if mmu_lock is held for write. + */ + if (!locked) { + locked = true; + spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + + /* + * Recheck after taking the spinlock, a different vCPU + * may have since marked the page unsync. A false + * positive on the unprotected check above is not + * possible as clearing sp->unsync _must_ hold mmu_lock + * for write, i.e. unsync cannot transition from 0->1 + * while this CPU holds mmu_lock for read (or write). + */ + if (READ_ONCE(sp->unsync)) + continue; + } + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } + if (locked) + spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible @@ -5583,6 +5609,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); + if (!kvm_mmu_init_tdp_mmu(kvm)) /* * No smp_load/store wrappers needed here as we are in -- Gitee From 12046f9fb8c9f3f8f1563c5b93f5fddc54542baa Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 13 Jul 2021 22:09:55 +0000 Subject: [PATCH 157/158] KVM: x86/mmu: fast_page_fault support for the TDP MMU mainline inclusion from mainline-v5.15-rc1 commit 6e8eb2060cc7fbc4d388d0ab70502e265aec98c6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6e8eb2060cc7fbc4d388d0ab70502e265aec98c6 ---------------------------------------------------------------------- Make fast_page_fault interoperate with the TDP MMU by leveraging walk_shadow_page_lockless_{begin,end} to acquire the RCU read lock and introducing a new helper function kvm_tdp_mmu_fast_pf_get_last_sptep to grab the lowest level sptep. Suggested-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20210713220957.3493520-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu.c | 50 ++++++++++++++++++++++++++++---------- arch/x86/kvm/mmu/tdp_mmu.c | 41 +++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 2 ++ 3 files changed, 80 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3d2a35a36564..891dc122b69d 100755 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3053,15 +3053,41 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) return spte & PT_PRESENT_MASK; } +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. 
If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between walk_shadow_page_lockless_{begin,end}. + * - The returned sptep must not be used after walk_shadow_page_lockless_end. + */ +static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) +{ + struct kvm_shadow_walk_iterator iterator; + u64 old_spte; + u64 *sptep = NULL; + + for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { + sptep = iterator.sptep; + *spte = old_spte; + + if (!is_shadow_present_pte(old_spte)) + break; + } + + return sptep; +} + /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) { - struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte = 0ull; + u64 *sptep = NULL; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) @@ -3072,14 +3098,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, gpa, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; + if (is_tdp_mmu(vcpu->arch.mmu)) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + else + sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); if (!is_shadow_present_pte(spte)) break; - sp = sptep_to_sp(iterator.sptep); + sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3137,8 +3164,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3151,7 +3177,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) } while (true); - trace_fast_page_fault(vcpu, gpa, error_code, iterator.sptep, spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3798,11 +3824,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_fault) { - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; - } + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 23d8b04b8cf1..83cfa162235b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -525,6 +525,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, if (is_removed_spte(iter->old_spte)) return false; + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. + */ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; @@ -1591,3 +1595,40 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } + +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. 
+ * + * Contract: + * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. + * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. + * + * WARNING: This function is only intended to be called during fast_page_fault. + */ +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + gfn_t gfn = addr >> PAGE_SHIFT; + tdp_ptep_t sptep = NULL; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + *spte = iter.old_spte; + sptep = iter.sptep; + } + + /* + * Perform the rcu_dereference to get the raw spte pointer value since + * we are passing it up to fast_page_fault, which is shared with the + * legacy MMU and thus does not retain the TDP MMU-specific __rcu + * annotation. + * + * This is safe since fast_page_fault obeys the contracts of this + * function as well as all TDP MMU contracts around modifying SPTEs + * outside of mmu_lock. + */ + return rcu_dereference(sptep); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index efe0b685f187..8197ba217309 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -84,6 +84,8 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void) int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte); #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); -- Gitee From 685f4d8ab6200b21f6791a34fedb1e313dbb44ee Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Thu, 16 Nov 2023 20:05:00 +0800 Subject: [PATCH 158/158] KVM: x86/mmu: Fix incorrect reference count for TDP MMU root Intel inclusion category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I7S3VQ CVE: NA ---------------------------------------------------------------------- Do not ref-count a TDP MMU root when fast zapping if the root SP is already marked as invalid. Previously, upstream commit 4c6654bd160d ("KVM: x86/mmu: Tear down roots before kvm_mmu_zap_all_fast returns") added logic to zap invalidated TDP MMU roots when memslot deletion happens, to avoid saddling the vCPU thread with the work of tearing down the entire paging structure. However, the root SP is still kept in the root list, because its reference count is not zero. Later, upstream commit f28e9c7fce14 ("KVM: x86/mmu: Fix wrong/misleading comments in TDP MMU fast zap") added a check of root->role.invalid and issued a warning if the root was already invalid. Such a warning is easy to trigger, because the invalidated root SP is still kept in the root list. Although Paolo fixed this with commit efd995dae5eb ("KVM: x86/mmu: Zap defunct roots via asynchronous worker"), that asynchronous solution was later dropped by Sean's commit 0df9dab891ff ("KVM: x86/mmu: Stop zapping invalidated TDP MMU roots asynchronously"). So, craft this patch based on both Paolo's and Sean's solutions, instead of porting them directly, due to their significant dependencies. Note: the fast zapping path is expected to only zap the root and drop the SP's reference count, but not to free it. Other vCPUs will still see a valid root_hpa until they later handle KVM_REQ_MMU_RELOAD, so a use-after-free could occur if the SP were already freed in the fast zapping path.
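For clarity, here is a simplified sketch of the intended flow. It is illustrative only and not part of the diff below: the wrapper function name is made up, the two helpers are the ones touched by this patch, and the locking shown follows the upstream fast zap path, so details may differ slightly in this tree.

/*
 * Illustrative sketch only: how kvm_tdp_mmu_invalidate_all_roots() and
 * kvm_tdp_mmu_zap_invalidated_roots() are meant to cooperate after this
 * change.  The wrapper name is hypothetical.
 */
static void example_tdp_mmu_fast_zap(struct kvm *kvm)
{
	/*
	 * With mmu_lock held for write, mark each currently valid root as
	 * invalid and flag it with tdp_mmu_scheduled_root_to_zap, taking a
	 * reference that the zap pass below will put.  Roots that are
	 * already invalid are skipped, so no extra reference is taken and
	 * no WARN fires for them.
	 */
	write_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	write_unlock(&kvm->mmu_lock);

	/*
	 * Later, zap only the roots that were scheduled above and drop the
	 * reference taken there.  The root pages themselves are not freed
	 * here; other vCPUs may still see them via root_hpa until they
	 * handle KVM_REQ_MMU_RELOAD.
	 */
	read_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_zap_invalidated_roots(kvm);
	read_unlock(&kvm->mmu_lock);
}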
Fixes: fe75023f66a0 ("KVM: x86/mmu: Fix wrong/misleading comments in TDP MMU fast zap") Tested-and-reported-by: Hua Su Signed-off-by: Yu Zhang --- arch/x86/kvm/mmu/mmu_internal.h | 8 +++++++- arch/x86/kvm/mmu/tdp_mmu.c | 13 +++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 090a4e24a427..b6a08efad25d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -26,7 +26,13 @@ struct kvm_mmu_page { struct list_head lpage_disallowed_link; bool unsync; - u8 mmu_valid_gen; + + union { + u8 mmu_valid_gen; + + /* Only accessed under slots_lock. */ + bool tdp_mmu_scheduled_root_to_zap; + }; bool mmio_cached; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 83cfa162235b..27e18b82f8d5 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -853,6 +853,11 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) while (root) { next_root = next_invalidated_root(kvm, root); + if (!root->tdp_mmu_scheduled_root_to_zap) + goto next; + + root->tdp_mmu_scheduled_root_to_zap = false; + rcu_read_unlock(); flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, @@ -860,10 +865,11 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) /* * Put the reference acquired in - * kvm_tdp_mmu_invalidate_roots + * kvm_tdp_mmu_invalidate_all_roots */ kvm_tdp_mmu_put_root(kvm, root, true); +next: root = next_root; rcu_read_lock(); @@ -905,8 +911,11 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) lockdep_assert_held_write(&kvm->mmu_lock); list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { - if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(kvm, root))) + if (!root->role.invalid && + !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(kvm, root))) { + root->tdp_mmu_scheduled_root_to_zap = true; root->role.invalid = true; + } } } -- Gitee
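As a closing note for readers unfamiliar with the lockless pattern that patch 157 builds on: the sketch below is a minimal userspace illustration of the read/recompute/cmpxchg/retry idea behind fast_pf_fix_direct_spte() and tdp_mmu_set_spte_atomic(). It is a toy written under stated assumptions, not kernel code; the bit layout, names and types are invented for the example.

/*
 * Toy model of the lockless "read, recompute, cmpxchg, retry" pattern used
 * by the fast page fault path.  A single 64-bit word stands in for an SPTE.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT	(1ull << 0)	/* stand-in for shadow-present */
#define SPTE_WRITABLE	(1ull << 1)	/* stand-in for PT_WRITABLE_MASK */

static _Atomic uint64_t fake_spte = SPTE_PRESENT;	/* present, read-only */

/* Try to make the "SPTE" writable without taking any lock. */
static bool fast_fix(_Atomic uint64_t *sptep)
{
	for (int retry = 0; retry < 4; retry++) {
		uint64_t old = atomic_load_explicit(sptep, memory_order_acquire);

		if (!(old & SPTE_PRESENT))
			return false;	/* bail out, let a slow path handle it */
		if (old & SPTE_WRITABLE)
			return true;	/* someone else already fixed it */

		uint64_t new_val = old | SPTE_WRITABLE;

		/* Equivalent of cmpxchg64: succeeds only if nobody raced with us. */
		if (atomic_compare_exchange_strong(sptep, &old, new_val))
			return true;
		/* Lost the race: re-read the value and retry. */
	}
	return false;
}

int main(void)
{
	bool fixed = fast_fix(&fake_spte);

	printf("fixed: %d, spte: %#llx\n", fixed,
	       (unsigned long long)atomic_load(&fake_spte));
	return 0;
}

The kernel code applies the same discipline to real SPTEs: the value is read under walk_shadow_page_lockless_{begin,end} (RCU for the TDP MMU), the new value is computed from the observed old value, and a 64-bit cmpxchg either installs it or forces the caller to retry the whole walk.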