From e1bdb19df62cacc362193c2ea8ecb7054ba68b27 Mon Sep 17 00:00:00 2001 From: Zhuang Yanying Date: Wed, 2 Aug 2023 11:09:06 +0800 Subject: [PATCH 1/2] KVM: fix overflow of zero page refcount with ksm running MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.10 commit 7df003c8521 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7PQPE?from=project-issue CVE: NA -------------------------------- We are testing Virtual Machine with KSM on v5.4-rc2 kernel, and found the zero_page refcount overflow. The cause of refcount overflow is increased in try_async_pf (get_user_page) without being decreased in mmu_set_spte() while handling ept violation. In kvm_release_pfn_clean(), only unreserved page will call put_page. However, zero page is reserved. So, as well as creating and destroy vm, the refcount of zero page will continue to increase until it overflows. step1: echo 10000 > /sys/kernel/pages_to_scan/pages_to_scan echo 1 > /sys/kernel/pages_to_scan/run echo 1 > /sys/kernel/pages_to_scan/use_zero_pages step2: just create several normal qemu kvm vms. And destroy it after 10s. Repeat this action all the time. After a long period of time, all domains hang because of the refcount of zero page overflow. Qemu print error log as follow: … error: kvm run failed Bad address EAX=00006cdc EBX=00000008 ECX=80202001 EDX=078bfbfd ESI=ffffffff EDI=00000000 EBP=00000008 ESP=00006cc4 EIP=000efd75 EFL=00010002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] CS =0008 00000000 ffffffff 00c09b00 DPL=0 CS32 [-RA] SS =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] DS =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] FS =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] GS =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] LDT=0000 00000000 0000ffff 00008200 DPL=0 LDT TR =0000 00000000 0000ffff 00008b00 DPL=0 TSS32-busy GDT= 000f7070 00000037 IDT= 000f70ae 00000000 CR0=00000011 CR2=00000000 CR3=00000000 CR4=00000000 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000ffff0ff0 DR7=0000000000000400 EFER=0000000000000000 Code=00 01 00 00 00 e9 e8 00 00 00 c7 05 4c 55 0f 00 01 00 00 00 <8b> 35 00 00 01 00 8b 3d 04 00 01 00 b8 d8 d3 00 00 c1 e0 08 0c ea a3 00 00 01 00 c7 05 04 … Meanwhile, a kernel warning is departed. [40914.836375] WARNING: CPU: 3 PID: 82067 at ./include/linux/mm.h:987 try_get_page+0x1f/0x30 [40914.836412] CPU: 3 PID: 82067 Comm: CPU 0/KVM Kdump: loaded Tainted: G OE 5.2.0-rc2 #5 [40914.836415] RIP: 0010:try_get_page+0x1f/0x30 [40914.836417] Code: 40 00 c3 0f 1f 84 00 00 00 00 00 48 8b 47 08 a8 01 75 11 8b 47 34 85 c0 7e 10 f0 ff 47 34 b8 01 00 00 00 c3 48 8d 78 ff eb e9 <0f> 0b 31 c0 c3 66 90 66 2e 0f 1f 84 00 0 0 00 00 00 48 8b 47 08 a8 [40914.836418] RSP: 0018:ffffb4144e523988 EFLAGS: 00010286 [40914.836419] RAX: 0000000080000000 RBX: 0000000000000326 RCX: 0000000000000000 [40914.836420] RDX: 0000000000000000 RSI: 00004ffdeba10000 RDI: ffffdf07093f6440 [40914.836421] RBP: ffffdf07093f6440 R08: 800000424fd91225 R09: 0000000000000000 [40914.836421] R10: ffff9eb41bfeebb8 R11: 0000000000000000 R12: ffffdf06bbd1e8a8 [40914.836422] R13: 0000000000000080 R14: 800000424fd91225 R15: ffffdf07093f6440 [40914.836423] FS: 00007fb60ffff700(0000) GS:ffff9eb4802c0000(0000) knlGS:0000000000000000 [40914.836425] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [40914.836426] CR2: 0000000000000000 CR3: 0000002f220e6002 CR4: 00000000003626e0 [40914.836427] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [40914.836427] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [40914.836428] Call Trace: [40914.836433] follow_page_pte+0x302/0x47b [40914.836437] __get_user_pages+0xf1/0x7d0 [40914.836441] ? irq_work_queue+0x9/0x70 [40914.836443] get_user_pages_unlocked+0x13f/0x1e0 [40914.836469] __gfn_to_pfn_memslot+0x10e/0x400 [kvm] [40914.836486] try_async_pf+0x87/0x240 [kvm] [40914.836503] tdp_page_fault+0x139/0x270 [kvm] [40914.836523] kvm_mmu_page_fault+0x76/0x5e0 [kvm] [40914.836588] vcpu_enter_guest+0xb45/0x1570 [kvm] [40914.836632] kvm_arch_vcpu_ioctl_run+0x35d/0x580 [kvm] [40914.836645] kvm_vcpu_ioctl+0x26e/0x5d0 [kvm] [40914.836650] do_vfs_ioctl+0xa9/0x620 [40914.836653] ksys_ioctl+0x60/0x90 [40914.836654] __x64_sys_ioctl+0x16/0x20 [40914.836658] do_syscall_64+0x5b/0x180 [40914.836664] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [40914.836666] RIP: 0033:0x7fb61cb6bfc7 Signed-off-by: LinFeng Signed-off-by: Zhuang Yanying Signed-off-by: Paolo Bonzini Signed-off-by: Lei Chen --- virt/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 96fb21ac7224..e4e3dc429ec4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -170,6 +170,7 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) */ if (pfn_valid(pfn)) return PageReserved(pfn_to_page(pfn)) && + !is_zero_pfn(pfn) && !kvm_is_zone_device_pfn(pfn); return true; -- Gitee From 9413304de009be2d0dae1223790f5b384bbc17f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 2 Aug 2023 11:09:07 +0800 Subject: [PATCH 2/2] KVM: Don't set Accessed/Dirty bits for ZERO_PAGE mainline inclusion from mainline-v6.0 commit a1040b0d42acf69bb4f6dbdc54c2dcd78eea1de5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7PQPE?from=project-issue CVE: NA -------------------------------- Don't set Accessed/Dirty bits for a struct page with PG_reserved set, i.e. don't set A/D bits for the ZERO_PAGE. The ZERO_PAGE (or pages depending on the architecture) should obviously never be written, and similarly there's no point in marking it accessed as the page will never be swapped out or reclaimed. The comment in page-flags.h is quite clear that PG_reserved pages should be managed only by their owner, and strictly following that mandate also simplifies KVM's logic. Fixes: 7df003c85218 ("KVM: fix overflow of zero page refcount with ksm running") Signed-off-by: Sean Christopherson Message-Id: <20220429010416.2788472-4-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Lei Chen --- virt/kvm/kvm_main.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e4e3dc429ec4..5f4f222c991f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1845,20 +1845,28 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn) } EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); -void kvm_set_pfn_dirty(kvm_pfn_t pfn) +static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) { - struct page *page = pfn_to_page(pfn); + if (!pfn_valid(pfn)) + return false; - if (!PageReserved(page)) - SetPageDirty(page); - } + /* + * Per page-flags.h, pages tagged PG_reserved "should in general not be + * touched (e.g. set dirty) except by its owner". + */ + return !PageReserved(pfn_to_page(pfn)); +} + +void kvm_set_pfn_dirty(kvm_pfn_t pfn) +{ + if (kvm_is_ad_tracked_pfn(pfn)) + SetPageDirty(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); void kvm_set_pfn_accessed(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) + if (kvm_is_ad_tracked_pfn(pfn)) mark_page_accessed(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); -- Gitee